In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# 1. Create a Sample DataFrame
data = {
    'Age': [25, 30, 35, np.nan, 40, 100, 45, 50, 55, 60],
    'Salary': [50000, 60000, np.nan, 70000, 80000, 90000, 100000, 110000, 120000, 130000],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female'],
    'Purchased': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]  # Target column
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# 2. Handle Missing Values
df['Age'].fillna(df['Age'].mean(), inplace=True)  # Fill Age NaN with mean
df['Salary'].fillna(df['Salary'].median(), inplace=True)  # Fill Salary NaN with median
print("\nDataFrame after handling missing values:")
print(df)

# 3. Handle Outliers (using IQR)
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
    df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])

handle_outliers(df, 'Age')
handle_outliers(df, 'Salary')
print("\nDataFrame after handling outliers:")
print(df)

# 4. Encode Categorical Variables (Gender)
df = pd.get_dummies(df, columns=['Gender'], drop_first=True)
print("\nDataFrame after encoding categorical variables:")
print(df)

# 5. Feature Scaling (Standard Scaling for Age and Salary)
scaler = StandardScaler()
df[['Age', 'Salary']] = scaler.fit_transform(df[['Age', 'Salary']])
print("\nDataFrame after feature scaling:")
print(df)

# 6. Split Data into Train and Test Sets
X = df.drop('Purchased', axis=1)
y = df['Purchased']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nTrain and Test sets:")
print("X_train shape:", X_train.shape, "X_test shape:", X_test.shape)

# 7. Handle Imbalanced Data (Oversampling with SMOTE)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print("\nAfter handling imbalanced data:")
print("X_resampled shape:", X_resampled.shape, "y_resampled shape:", y_resampled.shape)

Original DataFrame:
     Age    Salary  Gender  Purchased
0   25.0   50000.0    Male          0
1   30.0   60000.0  Female          1
2   35.0       NaN  Female          0
3    NaN   70000.0    Male          1
4   40.0   80000.0    Male          0
5  100.0   90000.0  Female          1
6   45.0  100000.0  Female          0
7   50.0  110000.0    Male          1
8   55.0  120000.0    Male          0
9   60.0  130000.0  Female          1

DataFrame after handling missing values:
          Age    Salary  Gender  Purchased
0   25.000000   50000.0    Male          0
1   30.000000   60000.0  Female          1
2   35.000000   90000.0  Female          0
3   48.888889   70000.0    Male          1
4   40.000000   80000.0    Male          0
5  100.000000   90000.0  Female          1
6   45.000000  100000.0  Female          0
7   50.000000  110000.0    Male          1
8   55.000000  120000.0    Male          0
9   60.000000  130000.0  Female          1

DataFrame after handling outliers:
         Ag

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)  # Fill Age NaN with mean
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)  # Fill Salary NaN with median


In [29]:
data = {
    'Age': [25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Male'],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Chicago']
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# 2. One-Hot Encoding for Categorical Variables (Gender and City)
df = pd.get_dummies(df, columns=['Gender', 'City'], drop_first=True)
print("\nDataFrame after One-Hot Encoding:")
print(df)

# 3. Min-Max Scaling (0 to 1) for Numerical Columns (Age and Salary)
scaler = MinMaxScaler()
df[['Age', 'Salary']] = scaler.fit_transform(df[['Age', 'Salary']])
print("\nDataFrame after Min-Max Scaling (0 to 1):")
print(df)

Original DataFrame:
   Age  Salary  Gender         City
0   25   50000    Male     New York
1   30   60000  Female  Los Angeles
2   35   70000  Female      Chicago
3   40   80000    Male     New York
4   45   90000    Male      Chicago

DataFrame after One-Hot Encoding:
   Age  Salary  Gender_Male  City_Los Angeles  City_New York
0   25   50000         True             False           True
1   30   60000        False              True          False
2   35   70000        False             False          False
3   40   80000         True             False           True
4   45   90000         True             False          False

DataFrame after Min-Max Scaling (0 to 1):
    Age  Salary  Gender_Male  City_Los Angeles  City_New York
0  0.00    0.00         True             False           True
1  0.25    0.25        False              True          False
2  0.50    0.50        False             False          False
3  0.75    0.75         True             False           True
4  1.00   

In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# 1. Create a Sample DataFrame
data = {
    'Age': [25, 30, 35, 40, 45, 25, 30, 35, 40, 45],
    'Salary': [50000, 60000, 70000, 80000, 90000, 50000, 60000, 70000, 80000, 90000],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male', 'Male'],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Chicago', 'New York', 'Los Angeles', 'Chicago', 'New York', 'Chicago'],
    'Experience': [1, 3, 5, 7, 10, 1, 3, 5, 7, 10]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# 2. Dropping Unnecessary Columns
df.drop(['Experience'], axis=1, inplace=True)  # Drop 'Experience' column
print("\nDataFrame after dropping unnecessary columns:")
print(df)

# 3. Checking for Duplicates
print("\nNumber of duplicates:", df.duplicated().sum())
df.drop_duplicates(inplace=True)  # Drop duplicates
print("\nDataFrame after removing duplicates:")
print(df)

# 5. Log Transformation (for Salary)
df['Log_Salary'] = np.log1p(df['Salary'])
print("\nDataFrame after log transformation of 'Salary':")
print(df)

# 6. Correlation Matrix
df = pd.get_dummies(df, columns=['Gender', 'City'], drop_first=True)
correlation_matrix = df.corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)

# # 7. Removing Highly Correlated Features
# corr_matrix = df.corr().abs()

# 4. Binning Numerical Data (Age into bins)
df['Age_Group'] = pd.cut(df['Age'], bins=[20, 30, 40, 50], labels=['20-30', '30-40', '40-50'])
print("\nDataFrame after binning 'Age':")
print(df)


Original DataFrame:
   Age  Salary  Gender         City  Experience
0   25   50000    Male     New York           1
1   30   60000  Female  Los Angeles           3
2   35   70000  Female      Chicago           5
3   40   80000    Male     New York           7
4   45   90000    Male      Chicago          10
5   25   50000    Male     New York           1
6   30   60000  Female  Los Angeles           3
7   35   70000  Female      Chicago           5
8   40   80000    Male     New York           7
9   45   90000    Male      Chicago          10

DataFrame after dropping unnecessary columns:
   Age  Salary  Gender         City
0   25   50000    Male     New York
1   30   60000  Female  Los Angeles
2   35   70000  Female      Chicago
3   40   80000    Male     New York
4   45   90000    Male      Chicago
5   25   50000    Male     New York
6   30   60000  Female  Los Angeles
7   35   70000  Female      Chicago
8   40   80000    Male     New York
9   45   90000    Male      Chicago

Number o