In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#for preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

#for evaluation
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#models
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier

# from xgboost import XGBRegressor
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
# from yellowbrick.cluster import KElbowVisualizer, silhouette_visualizer
# from sklearn.metrics import silhouette_score
# from sklearn.cluster import KMeans
import warnings
# NOTE(review): this silences every warning category for the whole kernel session,
# including deprecation/future warnings raised by the libraries imported above.
# Consider narrowing the filter to the specific category that is noisy.
warnings.filterwarnings("ignore")

In [60]:
# Read the MBA applications CSV (path is relative to the notebook's working
# directory) into `df` and preview the first five rows.
file_path = 'data/MBA/MBA.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,application_id,gender,international,gpmagoa,major,race,gmat,work_exp,work_industry,admission
0,1,Female,False,3.3,Business,Asian,620.0,3.0,Financial Services,Admit
1,2,Male,False,3.28,Humanities,Black,680.0,5.0,Investment Management,
2,3,Female,True,3.3,Business,,710.0,5.0,Technology,Admit
3,4,Male,False,3.47,STEM,Black,690.0,6.0,Technology,
4,5,Male,False,3.35,STEM,Hispanic,590.0,5.0,Consulting,


In [61]:
# Summarise column dtypes and non-null counts to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6194 entries, 0 to 6193
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   application_id  6194 non-null   int64  
 1   gender          6194 non-null   object 
 2   international   6194 non-null   bool   
 3   gpmagoa         6194 non-null   float64
 4   major           6194 non-null   object 
 5   race            4352 non-null   object 
 6   gmat            6194 non-null   float64
 7   work_exp        6194 non-null   float64
 8   work_industry   6194 non-null   object 
 9   admission       1000 non-null   object 
dtypes: bool(1), float64(3), int64(1), object(5)
memory usage: 441.7+ KB


In [62]:
# Count missing values per column (`isnull` is an alias of `isna`).
df.isnull().sum()

application_id       0
gender               0
international        0
gpmagoa              0
major                0
race              1842
gmat                 0
work_exp             0
work_industry        0
admission         5194
dtype: int64

In [63]:
# Drop the irrelevant identifier column.
# `errors='ignore'` keeps this cell re-runnable: the original in-place drop raised
# KeyError when executed a second time, because the column was already gone.
df = df.drop(columns='application_id', errors='ignore')

In [64]:
# List the distinct values of `race` (NaN shows up as its own entry).
df.loc[:, 'race'].unique()

array(['Asian', 'Black', nan, 'Hispanic', 'White', 'Other'], dtype=object)

In [65]:
# List the distinct values of the `international` flag.
df.loc[:, 'international'].unique()

array([False,  True])

In [66]:
# Replace missing `race` entries with the explicit category 'Unknown';
# rows that already have a value are kept as they are.
df['race'] = df['race'].where(df['race'].notna(), 'Unknown')

In [67]:
# Re-check the distinct `race` values after filling the missing entries.
df.loc[:, 'race'].unique()

array(['Asian', 'Black', 'Unknown', 'Hispanic', 'White', 'Other'],
      dtype=object)

In [68]:
# Preview the first five rows of the current frame.
df.head(n=5)

Unnamed: 0,gender,international,gpmagoa,major,race,gmat,work_exp,work_industry,admission
0,Female,False,3.3,Business,Asian,620.0,3.0,Financial Services,Admit
1,Male,False,3.28,Humanities,Black,680.0,5.0,Investment Management,
2,Female,True,3.3,Business,Unknown,710.0,5.0,Technology,Admit
3,Male,False,3.47,STEM,Black,690.0,6.0,Technology,
4,Male,False,3.35,STEM,Hispanic,590.0,5.0,Consulting,


In [69]:
# Encode gender as a binary indicator: Male -> 1, Female -> 0.
# Values that are not keys of the mapping (including the 0/1 codes left by a
# previous run of this cell) are passed through unchanged. The original
# `.map(dict)` replaced them with NaN, so running the cell twice wiped the column.
gender_codes = {'Male': 1, 'Female': 0}
df['gender'] = df['gender'].map(lambda value: gender_codes.get(value, value))

In [70]:
# Confirm the encoding introduced no new missing values (`isnull` == `isna`).
df.isnull().sum()

gender              0
international       0
gpmagoa             0
major               0
race                0
gmat                0
work_exp            0
work_industry       0
admission        5194
dtype: int64

In [71]:
# `international` is already a boolean column, so it is left as-is here
# (a string-keyed map such as {'True': 1, 'False': 0} would not match bool values).

# Integer-encode the nominal text columns.
# One encoder is fitted per column and kept in `label_encoders`, so every
# column's code -> label mapping stays available through `classes_` /
# `inverse_transform`. The original refit a single shared encoder, which kept
# only the mapping of the last column.
columns = ['major', 'race', 'work_industry']
label_encoders = {}
for cols in columns:
    le = LabelEncoder()
    df[cols] = le.fit_transform(df[cols])
    label_encoders[cols] = le

In [72]:
# Preview the first five rows after encoding the categorical columns.
df.head(n=5)

Unnamed: 0,gender,international,gpmagoa,major,race,gmat,work_exp,work_industry,admission
0,0,False,3.3,0,0,620.0,3.0,3,Admit
1,1,False,3.28,1,1,680.0,5.0,6,
2,0,True,3.3,0,4,710.0,5.0,13,Admit
3,1,False,3.47,2,1,690.0,6.0,13,
4,1,False,3.35,2,2,590.0,5.0,1,


In [18]:
# Split the frame into a one-hot-encoded feature matrix and the target column.
categorical_columns = ['major', 'race', 'work_industry']
X_data = df.drop(columns='admission')
# Name the columns to encode explicitly. Without `columns=`, get_dummies only
# converts object/string/category columns, so it silently skips these features
# once they hold integer codes; the original also left `categorical_columns` unused.
X_data = pd.get_dummies(X_data, columns=categorical_columns, drop_first=True)
# NOTE(review): the target is taken as stored in `df`; no cell visible here
# converts `admission` to numbers — confirm it is encoded before modelling.
Y_data = df['admission']

X_data.head()

Unnamed: 0,gender,international,gpa,gmat,work_exp,major_Humanities,major_STEM,race_Black,race_Hispanic,race_Other,...,work_industry_Health Care,work_industry_Investment Banking,work_industry_Investment Management,work_industry_Media/Entertainment,work_industry_Nonprofit/Gov,work_industry_Other,work_industry_PE/VC,work_industry_Real Estate,work_industry_Retail,work_industry_Technology
0,0,,3.3,620.0,3.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,,3.28,680.0,5.0,True,False,True,False,False,...,False,False,True,False,False,False,False,False,False,False
2,0,,3.3,710.0,5.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,1,,3.47,690.0,6.0,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,True
4,1,,3.35,590.0,5.0,False,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
# Preview the first five target values.
Y_data.head(n=5)

0    1.0
1    0.0
2    1.0
3    0.0
4    0.0
Name: admission, dtype: float64

In [25]:
# One-hot encode every remaining object/category column of `df`, dropping the
# first level of each, and rebind `df` to the encoded frame.
# NOTE(review): this is applied to the whole frame, so a non-numeric `admission`
# target would be expanded into dummy columns too — confirm that is intended.
df = pd.get_dummies(data=df, drop_first=True)
df.head(n=5)

Unnamed: 0,international,gpa,gmat,work_exp,gender_Female,gender_Male,major_Business,major_Humanities,major_STEM,race_Asian,...,work_industry_Media/Entertainment,work_industry_Nonprofit/Gov,work_industry_Other,work_industry_PE/VC,work_industry_Real Estate,work_industry_Retail,work_industry_Technology,admission_Admit,admission_Deny,admission_Waitlist
0,False,3.3,620.0,3.0,True,False,True,False,False,True,...,False,False,False,False,False,False,False,True,False,False
1,False,3.28,680.0,5.0,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,True,False
2,True,3.3,710.0,5.0,True,False,True,False,False,False,...,False,False,False,False,False,False,True,True,False,False
3,False,3.47,690.0,6.0,False,True,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False
4,False,3.35,590.0,5.0,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False


In [1]:
# Count missing values per column; requires the cells that create `df` to have
# been run first in the current kernel session.
df.isnull().sum()

NameError: name 'df' is not defined

In [None]:
# Create a fresh, unfitted label encoder.
# NOTE(review): this rebinds `le`, replacing the encoder fitted in the earlier
# label-encoding cell; nothing visible here uses the new instance — confirm it is needed.
le = LabelEncoder()