DATA PREPROCESSING


In [31]:


import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.impute import SimpleImputer
from scipy import stats
from nltk.corpus import stopwords

In [32]:
data = {
    'Age': [25, 30, np.nan, 35, 40, 28, 60, np.nan],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male'],
    'Salary': [50000, 60000, 55000, 70000, np.nan, 48000, 100000, 45000],
    'Date_of_Birth': ['1996-01-08', '1991-06-15', '1993-11-21', '1988-05-30', 
                      '1988-09-10', '1994-12-25', '1985-07-19', '1990-03-10'],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 
             'Phoenix', 'Philadelphia', 'San Antonio', 'San Diego'],
    'Job_Type': ['Engineer', 'Scientist', 'Artist', 'Engineer', 
                 'Manager', 'Engineer', 'Artist', 'Manager']
}

In [33]:
df=pd.DataFrame(data)
df

Unnamed: 0,Age,Gender,Salary,Date_of_Birth,City,Job_Type
0,25.0,Male,50000.0,1996-01-08,New York,Engineer
1,30.0,Female,60000.0,1991-06-15,Los Angeles,Scientist
2,,Female,55000.0,1993-11-21,Chicago,Artist
3,35.0,Male,70000.0,1988-05-30,Houston,Engineer
4,40.0,Female,,1988-09-10,Phoenix,Manager
5,28.0,Male,48000.0,1994-12-25,Philadelphia,Engineer
6,60.0,Female,100000.0,1985-07-19,San Antonio,Artist
7,,Male,45000.0,1990-03-10,San Diego,Manager


missing values

In [34]:
imputer=SimpleImputer(strategy='mean')
df[['Age','Salary']]=imputer.fit_transform(df[['Age','Salary']])

df

Unnamed: 0,Age,Gender,Salary,Date_of_Birth,City,Job_Type
0,25.0,Male,50000.0,1996-01-08,New York,Engineer
1,30.0,Female,60000.0,1991-06-15,Los Angeles,Scientist
2,36.333333,Female,55000.0,1993-11-21,Chicago,Artist
3,35.0,Male,70000.0,1988-05-30,Houston,Engineer
4,40.0,Female,61142.857143,1988-09-10,Phoenix,Manager
5,28.0,Male,48000.0,1994-12-25,Philadelphia,Engineer
6,60.0,Female,100000.0,1985-07-19,San Antonio,Artist
7,36.333333,Male,45000.0,1990-03-10,San Diego,Manager


HANDLING CATEGORICAL DATA ---LABLE ENCODING FOR GENDER

In [35]:
lable_encoder = LabelEncoder()
df["Gender"] = lable_encoder.fit_transform(df["Gender"])
df

Unnamed: 0,Age,Gender,Salary,Date_of_Birth,City,Job_Type
0,25.0,1,50000.0,1996-01-08,New York,Engineer
1,30.0,0,60000.0,1991-06-15,Los Angeles,Scientist
2,36.333333,0,55000.0,1993-11-21,Chicago,Artist
3,35.0,1,70000.0,1988-05-30,Houston,Engineer
4,40.0,0,61142.857143,1988-09-10,Phoenix,Manager
5,28.0,1,48000.0,1994-12-25,Philadelphia,Engineer
6,60.0,0,100000.0,1985-07-19,San Antonio,Artist
7,36.333333,1,45000.0,1990-03-10,San Diego,Manager


In [36]:
df.Job_Type.unique()
df['Age'] = df['Age'].round(0)  # Round to the nearest whole number
df['Salary'] = df['Salary'].round(-3)  # Round salary to the nearest thousand
df

Unnamed: 0,Age,Gender,Salary,Date_of_Birth,City,Job_Type
0,25.0,1,50000.0,1996-01-08,New York,Engineer
1,30.0,0,60000.0,1991-06-15,Los Angeles,Scientist
2,36.0,0,55000.0,1993-11-21,Chicago,Artist
3,35.0,1,70000.0,1988-05-30,Houston,Engineer
4,40.0,0,61000.0,1988-09-10,Phoenix,Manager
5,28.0,1,48000.0,1994-12-25,Philadelphia,Engineer
6,60.0,0,100000.0,1985-07-19,San Antonio,Artist
7,36.0,1,45000.0,1990-03-10,San Diego,Manager


In [None]:
df['Age'] = df['Age'].round(0)  # Round to the nearest whole number
df['Salary'] = df['Salary'].round(-3)  # Round salary to the nearest thousand
df


Unnamed: 0,Age,Gender,Salary,Date_of_Birth,City,Job_Type
0,25.0,1,50000.0,1996-01-08,New York,Engineer
1,30.0,0,60000.0,1991-06-15,Los Angeles,Scientist
2,36.0,0,55000.0,1993-11-21,Chicago,Artist
3,35.0,1,70000.0,1988-05-30,Houston,Engineer
4,40.0,0,61000.0,1988-09-10,Phoenix,Manager
5,28.0,1,48000.0,1994-12-25,Philadelphia,Engineer
6,60.0,0,100000.0,1985-07-19,San Antonio,Artist
7,36.0,1,45000.0,1990-03-10,San Diego,Manager


In [38]:
df["engineer"] = df["Job_Type"].apply(lambda x: 1 if x == "Engineer" else 0)
df["scientist"] = df["Job_Type"].apply(lambda x: 1 if x == "Scientist" else 0)
df["artist"] = df["Job_Type"].apply(lambda x: 1 if x == "Artist" else 0)
df["manager"] = df["Job_Type"].apply(lambda x: 1 if x == "Manager" else 0)
df

Unnamed: 0,Age,Gender,Salary,Date_of_Birth,City,Job_Type,engineer,scientist,artist,manager
0,25.0,1,50000.0,1996-01-08,New York,Engineer,1,0,0,0
1,30.0,0,60000.0,1991-06-15,Los Angeles,Scientist,0,1,0,0
2,36.0,0,55000.0,1993-11-21,Chicago,Artist,0,0,1,0
3,35.0,1,70000.0,1988-05-30,Houston,Engineer,1,0,0,0
4,40.0,0,61000.0,1988-09-10,Phoenix,Manager,0,0,0,1
5,28.0,1,48000.0,1994-12-25,Philadelphia,Engineer,1,0,0,0
6,60.0,0,100000.0,1985-07-19,San Antonio,Artist,0,0,1,0
7,36.0,1,45000.0,1990-03-10,San Diego,Manager,0,0,0,1


In [48]:
df['Date_of_Birth'] = pd.to_datetime(df['Date_of_Birth'])
df['Year_of_Birth'] = df['Date_of_Birth'].dt.year
df['Month_of_Birth'] = df['Date_of_Birth'].dt.month 
df['Day_of_Birth'] = df['Date_of_Birth'].dt.day
df

Unnamed: 0,Age,Gender,Salary,Date_of_Birth,City,Job_Type,engineer,scientist,artist,manager,Year_of_Birth,Month_of_Birth,Day_of_Birth
0,25.0,1,50000.0,1996-01-08,New York,Engineer,1,0,0,0,1996,1,8
1,30.0,0,60000.0,1991-06-15,Los Angeles,Scientist,0,1,0,0,1991,6,15
2,36.0,0,55000.0,1993-11-21,Chicago,Artist,0,0,1,0,1993,11,21
3,35.0,1,70000.0,1988-05-30,Houston,Engineer,1,0,0,0,1988,5,30
4,40.0,0,61000.0,1988-09-10,Phoenix,Manager,0,0,0,1,1988,9,10
5,28.0,1,48000.0,1994-12-25,Philadelphia,Engineer,1,0,0,0,1994,12,25
6,60.0,0,100000.0,1985-07-19,San Antonio,Artist,0,0,1,0,1985,7,19
7,36.0,1,45000.0,1990-03-10,San Diego,Manager,0,0,0,1,1990,3,10


In [49]:
df["City"] = lable_encoder.fit_transform(df["City"])
df["new_york"] = df["City"].apply(lambda x: 1 if x == 0 else 0)
df["los_angeles"] = df["City"].apply(lambda x: 1 if x == 1 else 0)
df["chicago"] = df["City"].apply(lambda x: 1 if x == 2 else 0)
df["houston"] = df["City"].apply(lambda x: 1 if x == 3 else 0)
df["phoenix"] = df["City"].apply(lambda x: 1 if x == 4 else 0)
df["philadelphia"] = df["City"].apply(lambda x: 1 if x == 5 else 0)
df["san_antonio"] = df["City"].apply(lambda x: 1 if x == 6 else 0)
df["san_diego"] = df["City"].apply(lambda x: 1 if x == 7 else 0)
df

Unnamed: 0,Age,Gender,Salary,Date_of_Birth,City,Job_Type,engineer,scientist,artist,manager,...,Month_of_Birth,Day_of_Birth,new_york,los_angeles,chicago,houston,phoenix,philadelphia,san_antonio,san_diego
0,25.0,1,50000.0,1996-01-08,3,Engineer,1,0,0,0,...,1,8,0,0,0,1,0,0,0,0
1,30.0,0,60000.0,1991-06-15,2,Scientist,0,1,0,0,...,6,15,0,0,1,0,0,0,0,0
2,36.0,0,55000.0,1993-11-21,0,Artist,0,0,1,0,...,11,21,1,0,0,0,0,0,0,0
3,35.0,1,70000.0,1988-05-30,1,Engineer,1,0,0,0,...,5,30,0,1,0,0,0,0,0,0
4,40.0,0,61000.0,1988-09-10,5,Manager,0,0,0,1,...,9,10,0,0,0,0,0,1,0,0
5,28.0,1,48000.0,1994-12-25,4,Engineer,1,0,0,0,...,12,25,0,0,0,0,1,0,0,0
6,60.0,0,100000.0,1985-07-19,6,Artist,0,0,1,0,...,7,19,0,0,0,0,0,0,1,0
7,36.0,1,45000.0,1990-03-10,7,Manager,0,0,0,1,...,3,10,0,0,0,0,0,0,0,1


In [None]:

#df.Year_of_Birth.unique()
df.Year_of_Birth.value_counts()

Year_of_Birth
1988    2
1991    1
1996    1
1993    1
1994    1
1985    1
1990    1
Name: count, dtype: int64

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Age             8 non-null      float64       
 1   Gender          8 non-null      int64         
 2   Salary          8 non-null      float64       
 3   Date_of_Birth   8 non-null      datetime64[ns]
 4   City            8 non-null      int64         
 5   Job_Type        8 non-null      object        
 6   engineer        8 non-null      int64         
 7   scientist       8 non-null      int64         
 8   artist          8 non-null      int64         
 9   manager         8 non-null      int64         
 10  Year_of_Birth   8 non-null      int32         
 11  Month_of_Birth  8 non-null      int32         
 12  Day_of_Birth    8 non-null      int32         
 13  new_york        8 non-null      int64         
 14  los_angeles     8 non-null      int64         
 15  chicago   