In [8]:
import pandas as pd
import numpy as np

data = {
    'st_id': [101, 102, 103, 104, 105, 106],
    'Name': ['Arun', 'Raju', 'Chitra', 'Sita', 'Ramu', 'Latha'],
    'Maths': [85, np.nan, 75, 88, 60, 999],      # 999 is an outlier, np.nan is missing
    'Science': [78, 92, np.nan, 85, 70, 100]     # Missing value at index 2
}

df = pd.DataFrame(data)
df

Unnamed: 0,st_id,Name,Maths,Science
0,101,Arun,85.0,78.0
1,102,Raju,,92.0
2,103,Chitra,75.0,
3,104,Sita,88.0,85.0
4,105,Ramu,60.0,70.0
5,106,Latha,999.0,100.0


In [12]:
df_selected = df[['Maths', 'Science']]
print("\nAfter Attribute selection:\n", df_selected)



After Attribute selection:
    Maths  Science
0   85.0     78.0
1    NaN     92.0
2   75.0      NaN
3   88.0     85.0
4   60.0     70.0
5  999.0    100.0


In [14]:
df_selected['Maths'].fillna(df_selected['Maths'].mean(), inplace=True)
df_selected['Science'].fillna(df_selected['Science'].mean(), inplace=True)
print("\nAfter Handling Missing Values:\n", df_selected)



After Handling Missing Values:
    Maths  Science
0   85.0     78.0
1  261.4     92.0
2   75.0     85.0
3   88.0     85.0
4   60.0     70.0
5  999.0    100.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_selected['Maths'].fillna(df_selected['Maths'].mean(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Maths'].fillna(df_selected['Maths'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, in

In [19]:
bins = [0, 60, 80, 1000]
labels = ['Low', 'Medium', 'High']
df_selected['Maths_Category'] = pd.cut(df_selected['Maths'], bins=bins, labels=labels)
print("\nAfter Discretization:\n", df_selected)



After Discretization:
    Maths  Science Maths_Category
0   85.0     78.0           High
1  261.4     92.0           High
2   75.0     85.0         Medium
3   88.0     85.0           High
4   60.0     70.0            Low
5  999.0    100.0           High


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Maths_Category'] = pd.cut(df_selected['Maths'], bins=bins, labels=labels)


In [20]:
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return df[(df[column] >= lower) & (df[column] <= upper)]

# Remove outliers from Maths
df_clean = remove_outliers_iqr(df_selected, 'Maths')
print("\nAfter Outlier Elimination:\n", df_clean)



After Outlier Elimination:
    Maths  Science Maths_Category
0   85.0     78.0           High
1  261.4     92.0           High
2   75.0     85.0         Medium
3   88.0     85.0           High
4   60.0     70.0            Low
