## Nume studenti:
- Alexandra Manole
- Teodor Mihaescu

## Grupa: 382

# Data Set 4: Adult
### (Missing values: yes)

In [20]:
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import cross_validate, cross_val_score, GridSearchCV, RandomizedSearchCV

In [2]:
data = pd.read_csv('./Datasets/adult.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Age            32561 non-null  int64 
 1   WorkClass      30725 non-null  object
 2   Fnlwgt         32561 non-null  int64 
 3   Education      32561 non-null  object
 4   EducationNum   32561 non-null  int64 
 5   MaritalStatus  32561 non-null  object
 6   Occupation     30718 non-null  object
 7   Relationship   32561 non-null  object
 8   Race           32561 non-null  object
 9   Sex            32561 non-null  object
 10  CapitalGain    32561 non-null  int64 
 11  CapitalLoss    32561 non-null  int64 
 12  HoursPerWeek   32561 non-null  int64 
 13  NativeCountry  31978 non-null  object
 14  Income         32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [3]:
# Missing values imputation
"""
Observam ca avem missing values in 3 dintre atribute:
- WorkClass
- Occupation
- NativeCountry

Din acest motiv vom folosi un fillna pentru calcularea valorilor ce ar putea substitui missing values. Vom crea o noua clasa unknown pentru fiecare valoare lipsa. Nu alegem alta metoda de fill, precum: cea mai itnalnita valoare, medie, mediana, etc. deoarece umplerea atributelor ce contin missing values cu astfel de metode ar putea influenta drastic datele de iesire. (De exemplu: se observa ca sunt inregistrari cu persoane ce castiga anual peste $50k, la workClass avand missingValue, daca am substitui o astfel de valoare lipsa cu never-worked am ajunge sa denaturam ponderile categoriei persoanelor ce nu au lucrat niciodata, acest lucru ducand la coruperea estimarilor finale)
"""
data.fillna('unknown', inplace=True)

In [4]:
# Pregateste y
y = data.loc[:, 'Income']
y = np.where(y == '<=50K', 0, 1).reshape(data.shape[0], 1)
y = pd.DataFrame(y, columns=['Income'])

In [5]:
# Pregateste X (+ One-hot encoding pe datele categoriale din data)
X = data['Age']

data['WorkClass'] = pd.Categorical(data['WorkClass'])
X_categ = pd.get_dummies(data['WorkClass'], prefix='wc')
X = pd.concat([X, X_categ], axis=1)

X = pd.concat([X, data['Fnlwgt']], axis=1)

data['Education'] = pd.Categorical(data['Education'])
X_categ = pd.get_dummies(data['Education'], prefix='ed')
X = pd.concat([X, X_categ], axis=1)

X = pd.concat([X, data['EducationNum']], axis=1)

data['MaritalStatus'] = pd.Categorical(data['MaritalStatus'])
X_categ = pd.get_dummies(data['MaritalStatus'], prefix='ms')
X = pd.concat([X, X_categ], axis=1)

data['Occupation'] = pd.Categorical(data['Occupation'])
X_categ = pd.get_dummies(data['Occupation'], prefix='oc')
X = pd.concat([X, X_categ], axis=1)

data['Relationship'] = pd.Categorical(data['Relationship'])
X_categ = pd.get_dummies(data['Relationship'], prefix='rl')
X = pd.concat([X, X_categ], axis=1)

data['Race'] = pd.Categorical(data['Race'])
X_categ = pd.get_dummies(data['Race'], prefix='rc')
X = pd.concat([X, X_categ], axis=1)

data['Sex'] = pd.Categorical(data['Sex'])
X_categ = pd.get_dummies(data['Sex'], prefix='sx')
X = pd.concat([X, X_categ], axis=1)

X = pd.concat([X, data.loc[:, 'CapitalGain':'HoursPerWeek']], axis=1)

data['NativeCountry'] = pd.Categorical(data['NativeCountry'])
X_categ = pd.get_dummies(data['NativeCountry'], prefix='nc')
X = pd.concat([X, X_categ], axis=1)

In [6]:
# Scaleaza datele
X_columns = X.columns
X_index = X.index

min_max_scaler = preprocessing.MinMaxScaler()
X = pd.DataFrame(min_max_scaler.fit_transform(X), columns=X_columns, index=X_index)

In [7]:
# Separarea setului de date
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/3, shuffle=True)

In [8]:
# Verifica daca sunt egal distribuite
print(Counter(y_train.iloc[:,0]))
print(Counter(y_test.iloc[:,0]))

Counter({0: 16496, 1: 5211})
Counter({0: 8224, 1: 2630})


## Model 1: KNeighborsClassifier

In [9]:
# Model
"""
"""
model1 = KNeighborsClassifier(n_neighbors=2)
model1.fit(X_train, y_train)
y_hat = model1.predict(X_test)

In [10]:
print(f'Predicted:\n{y_hat}\n')
print(f'Ground truth:\n{np.array(y_test.iloc[:, 0])}\n')
print(f'Failed:\t{sum(y_hat != np.array(y_test.iloc[:, 0]))}')

Predicted:
[0 0 0 ... 0 1 0]

Ground truth:
[0 0 0 ... 0 1 0]

Failed:	2115


In [11]:
# Cross Validation
results_train = cross_validate(model1, X, y, scoring=['accuracy','f1'], return_train_score=True)
results_train

{'fit_time': array([2.16566086, 2.09367871, 2.01396847, 2.08544755, 2.03748941]),
 'score_time': array([28.4744556 , 24.4670825 , 25.42822146, 24.51343679, 25.86290073]),
 'test_accuracy': array([0.80346998, 0.81111794, 0.81219287, 0.80543612, 0.80973587]),
 'train_accuracy': array([0.89461763, 0.89446812, 0.89412262, 0.89316289, 0.89227993]),
 'test_f1': array([0.47281713, 0.49299258, 0.49774127, 0.47967146, 0.48223987]),
 'train_f1': array([0.71986937, 0.7194039 , 0.7182264 , 0.71494418, 0.71190965])}

In [12]:
# Cautare hiperparametrii optimi - GridSearch
parameter_grid = {'n_neighbors': list(range(1, 10)), 'p': [1, 2, 3]}
grid_search = GridSearchCV(estimator = KNeighborsClassifier(), param_grid=parameter_grid, scoring='accuracy', cv=4, return_train_score=True)
grid_search.fit(X_train_mat, y_train_col)
results_gscv = cross_val_score(grid_search, X_test_mat, y_test_col, scoring='accuracy')

NameError: name 'X_train_mat' is not defined

In [13]:
print(results_gscv.mean())
print(grid_search.best_params_)

NameError: name 'results_gscv' is not defined

In [14]:
# Cautare hiperparametrii optimi -RandomSearch
randomized_search = RandomizedSearchCV(estimator = KNeighborsClassifier(), param_distributions=parameter_grid, random_state=0,  scoring='accuracy', cv=4 )
randomized_search.fit(X_train, y_train)
results_rscv = cross_val_score(randomized_search, X_test, y_test, cv=5)

NameError: name 'RandomizedSearchCV' is not defined

In [15]:
print(results_rscv.mean())
print(randomized_search.best_params_)

NameError: name 'results_rscv' is not defined

## Model 2: MLP

In [16]:
# Model
"""
"""
model2 = MLPClassifier(activation='logistic', learning_rate='adaptive')
model2.fit(X_train, y_train)
y_hat = model2.predict(X_test)

In [17]:
print(f'Predicted:\n{y_hat}\n')
print(f'Ground truth:\n{np.array(y_test.iloc[:, 0])}\n')
print(f'Failed:\t{sum(y_hat != np.array(y_test.iloc[:, 0]))}')

Predicted:
[0 0 0 ... 0 1 0]

Ground truth:
[0 0 0 ... 0 1 0]

Failed:	1643


In [18]:
# Cross Validation
results_train = cross_validate(model2, X, y, scoring=['accuracy','f1'], return_train_score=True)
results_train

{'fit_time': array([13.7989645 , 33.48873973, 33.30314517, 37.43271875, 15.30637121]),
 'score_time': array([0.01995087, 0.0199461 , 0.02194023, 0.0169549 , 0.02493477]),
 'test_accuracy': array([0.84692154, 0.84858722, 0.85503686, 0.85933661, 0.85181204]),
 'train_accuracy': array([0.85396192, 0.85888134, 0.85842067, 0.85573343, 0.85281585]),
 'test_f1': array([0.66328943, 0.66778976, 0.6802168 , 0.67191977, 0.66388018]),
 'train_f1': array([0.67952822, 0.68947457, 0.68671424, 0.6652414 , 0.66058782])}

In [19]:
# Cautare hiperparametrii optimi - GridSearch
parameter_grid = {'activation': ['identity'],
'solver': ['sgd'],
'learning_rate': ['constant', 'invscaling', 'adaptive'],
'learning_rate_init': np.linspace(0, 0.5, 10),
'random_state':[0]}
grid_search = GridSearchCV(estimator = MLPClassifier(), param_grid=parameter_grid, scoring='accuracy', cv=4, return_train_score=True)
grid_search.fit(X_train, y_train)
results_gscv = cross_val_score(grid_search, X_test, y_test, scoring='accuracy')

In [21]:
print(results_gscv.mean())
print(grid_search.best_params_)

0.8501938200875809
{'activation': 'identity', 'learning_rate': 'adaptive', 'learning_rate_init': 0.4444444444444444, 'random_state': 0, 'solver': 'sgd'}


In [22]:
# Cautare hiperparametrii optimi -RandomSearch
randomized_search = RandomizedSearchCV(estimator = MLPClassifier(), param_distributions=parameter_grid, random_state=0,  scoring='accuracy', cv=4 )
randomized_search.fit(X_train, y_train)
results_rscv = cross_val_score(randomized_search, X_test, y_test, cv=5)

In [23]:
print(results_rscv.mean())
print(randomized_search.best_params_)

0.849548828610061
{'solver': 'sgd', 'random_state': 0, 'learning_rate_init': 0.4444444444444444, 'learning_rate': 'adaptive', 'activation': 'identity'}


## Model 3: LogisticRegression

In [24]:
# Model
"""
"""
model3 = LogisticRegression(multi_class='ovr', max_iter=1000)
model3.fit(X_train, y_train)
y_hat = model3.predict(X_test)

In [25]:
print(f'Predicted:\n{y_hat}\n')
print(f'Ground truth:\n{np.array(y_test.iloc[:, 0])}\n')
print(f'Failed:\t{sum(y_hat != np.array(y_test.iloc[:, 0]))}')

Predicted:
[0 0 0 ... 0 1 0]

Ground truth:
[0 0 0 ... 0 1 0]

Failed:	1647


In [26]:
# Cross Validation
results_train = cross_validate(model3, X, y, scoring=['accuracy','f1'], return_train_score=True)
results_train

{'fit_time': array([1.25624347, 1.23175883, 1.11982441, 1.15941191, 1.2410562 ]),
 'score_time': array([0.00797868, 0.00997496, 0.00798082, 0.00797892, 0.00801086]),
 'test_accuracy': array([0.84738216, 0.84367322, 0.8507371 , 0.85611179, 0.85273342]),
 'train_accuracy': array([0.85254146, 0.85300779, 0.85170256, 0.8505125 , 0.85081961]),
 'test_f1': array([0.649259  , 0.64180155, 0.65870787, 0.66714032, 0.66597005]),
 'train_f1': array([0.66191356, 0.66468167, 0.66021638, 0.65733897, 0.65622788])}

In [27]:
# Cautare hiperparametrii optimi - GridSearch
parameter_grid = {
    'C' : np.linspace(0, 100, 10),# C = 1/lambda ; C mare => lambda mic
    'solver': ['sag', 'saga', 'lbfgs']}
grid_search = GridSearchCV(estimator = LogisticRegression(), param_grid=parameter_grid, scoring='accuracy', cv=4, return_train_score=True)
grid_search.fit(X_train, y_train)
results_gscv = cross_val_score(grid_search, X_test, y_test, scoring='accuracy')

In [28]:
print(results_gscv.mean())
print(grid_search.best_params_)

0.8491800801091897
{'C': 33.33333333333333, 'solver': 'lbfgs'}


In [29]:
# Cautare hiperparametrii optimi -RandomSearch
randomized_search = RandomizedSearchCV(estimator = LogisticRegression(), param_distributions=parameter_grid, random_state=0,  scoring='accuracy', cv=4 )
randomized_search.fit(X_train, y_train)
results_rscv = cross_val_score(randomized_search, X_test, y_test, cv=5)

In [30]:
print(results_rscv.mean())
print(randomized_search.best_params_)

0.8496407822426753
{'solver': 'lbfgs', 'C': 33.33333333333333}


## Model 4: DecisionTreeClassifier

In [31]:
# Model
"""
"""
model4 = DecisionTreeClassifier(max_depth=2)
model4.fit(X_train, y_train)
y_hat = model4.predict(X_test)

In [32]:
print(f'Predicted:\n{y_hat}\n')
print(f'Ground truth:\n{np.array(y_test.iloc[:, 0])}\n')
print(f'Failed:\t{sum(y_hat != np.array(y_test.iloc[:, 0]))}')

Predicted:
[0 0 0 ... 0 1 0]

Ground truth:
[0 0 0 ... 0 1 0]

Failed:	1873


In [39]:
# Cross Validation
results_train = cross_validate(model4, X, y, scoring=['accuracy','f1'], return_train_score=True)
results_train

{'fit_time': array([0.13376975, 0.09326601, 0.09429097, 0.09075236, 0.08576965]),
 'score_time': array([0.00797796, 0.0080111 , 0.01196861, 0.00598407, 0.00598478]),
 'test_accuracy': array([0.82158759, 0.82693489, 0.82923833, 0.8343059 , 0.82908477]),
 'train_accuracy': array([0.82989097, 0.82855388, 0.82797804, 0.8267112 , 0.82801643]),
 'test_f1': array([0.53594249, 0.55647383, 0.56082148, 0.57063271, 0.56025286]),
 'train_f1': array([0.56202432, 0.55694444, 0.55585291, 0.55342303, 0.55599604])}

In [34]:
# Cautare hiperparametrii optimi - GridSearch
parameter_grid = {'max_depth': list(range(1,5)),
'min_samples_split': np.linspace(1,3,5),
'min_samples_leaf': np.linspace(0.1,0.5,5),
'max_features': ['sqrt', 'log2', None]}
grid_search = GridSearchCV(estimator = DecisionTreeClassifier(), param_grid=parameter_grid, scoring='accuracy', cv=4, return_train_score=True)
grid_search.fit(X_train, y_train)
results_gscv = cross_val_score(grid_search, X_test, y_test, scoring='accuracy')

In [35]:
print(results_gscv.mean())
print(grid_search.best_params_)

0.7576930081701184
{'max_depth': 1, 'max_features': 'sqrt', 'min_samples_leaf': 0.1, 'min_samples_split': 1.0}


In [36]:
# Cautare hiperparametrii optimi -RandomSearch
randomized_search = RandomizedSearchCV(estimator = DecisionTreeClassifier(), param_distributions=parameter_grid, random_state=0,  scoring='accuracy', cv=4 )
randomized_search.fit(X_train, y_train)
results_rscv = cross_val_score(randomized_search, X_test, y_test, cv=5)

In [37]:
print(results_rscv.mean())
print(randomized_search.best_params_)

0.7576930081701184
{'min_samples_split': 1.0, 'min_samples_leaf': 0.2, 'max_features': 'sqrt', 'max_depth': 4}


## Model 5: SVC (Support Vector Classification)

In [38]:
# Model
"""
"""
model5 = SVC(C=2, gamma='auto')
model5.fit(X_train, y_train)
y_hat = model5.predict(X_test)

In [46]:
print(f'Predicted:\n{y_hat}\n')
print(f'Ground truth:\n{np.array(y_test.iloc[:, 0])}\n')
print(f'Failed:\t{sum(y_hat != np.array(y_test.iloc[:, 0]))}')

Predicted:
[0 0 0 ... 0 1 0]

Ground truth:
[0 0 0 ... 0 1 0]

Failed:	1801


In [47]:
# Cross Validation
results_train = cross_validate(model4, X, y, scoring=['accuracy','f1'], return_train_score=True)
results_train

{'fit_time': array([0.10173011, 0.08273077, 0.07682872, 0.07684517, 0.0738039 ]),
 'score_time': array([0.0079782 , 0.00598502, 0.00697732, 0.00553966, 0.00603247]),
 'test_accuracy': array([0.82158759, 0.82693489, 0.82923833, 0.8343059 , 0.82908477]),
 'train_accuracy': array([0.82989097, 0.82855388, 0.82797804, 0.8267112 , 0.82801643]),
 'test_f1': array([0.53594249, 0.55647383, 0.56082148, 0.57063271, 0.56025286]),
 'train_f1': array([0.56202432, 0.55694444, 0.55585291, 0.55342303, 0.55599604])}

In [42]:
# Cautare hiperparametrii optimi - GridSearch
parameter_grid = {'C': np.linspace(0,1,5),
'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
'gamma': ['scale', 'auto']}
grid_search = GridSearchCV(estimator = SVC(), param_grid=parameter_grid, scoring='accuracy', cv=4, return_train_score=True)
grid_search.fit(X_train, y_train)
results_gscv = cross_val_score(grid_search, X_test, y_test, scoring='accuracy')

NameError: name 'y1_train' is not defined

In [43]:
print(results_gscv.mean())
print(grid_search.best_params_)

0.7576930081701184


AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [44]:
# Cautare hiperparametrii optimi -RandomSearch
randomized_search = RandomizedSearchCV(estimator = SVC(), param_distributions=parameter_grid, random_state=0,  scoring='accuracy', cv=4 )
randomized_search.fit(X_train, y_train)
results_rscv = cross_val_score(randomized_search, X_test, y_test, cv=5)

In [45]:
print(results_rscv.mean())
print(randomized_search.best_params_)

0.846876399628959
{'kernel': 'linear', 'gamma': 'auto', 'C': 0.75}
