In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report




In [2]:
# Load the dataset; the cells below use its `income_num` column (values 0/1)
# as the classification target and every other column as a feature.
df = pd.read_csv("adult_clean.csv")

In [3]:
# Absolute class counts of the target: shows how imbalanced the two classes are.
df["income_num"].value_counts()

0    22654
1     7508
Name: income_num, dtype: int64

In [4]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
(training_features, test_features,
 training_target, test_target) = train_test_split(
    df.drop(columns=['income_num']),
    df['income_num'],
    test_size=0.1,
    random_state=12,
)


In [5]:
# One-hot encode the training split first, then force the test split onto the
# exact same columns. Encoding each split independently can produce different
# column sets (a category that appears in only one split), which would make
# model.predict fail or silently misalign features. Dummies missing from the
# test split are filled with 0; dummies unseen during training are dropped.
training_features = pd.get_dummies(training_features)
test_features = pd.get_dummies(test_features).reindex(
    columns=training_features.columns, fill_value=0
)

In [6]:
# Baseline: logistic regression on the original (imbalanced) training data.
# C is passed as a very large value (1e10); the repr below shows an l2 penalty.
model = LogisticRegression(C=1e10)
# fit() is the last expression so the fitted estimator's repr is displayed.
model.fit(X=training_features, y=training_target)



LogisticRegression(C=10000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [7]:
# Predictions of the baseline model on the held-out test split.
y_pred = model.predict(X=test_features)

In [8]:
# ROC AUC of the baseline model on the test split.
# NOTE(review): y_pred comes from model.predict, i.e. class predictions rather
# than probabilities; model.predict_proba(test_features)[:, 1] would give the
# conventional ranking-based AUC. Left as-is so all three models are scored
# the same way -- confirm which metric is intended.
roc_auc_score(y_true=test_target, y_score=y_pred)

0.7537397216664184

In [9]:
# Per-class precision/recall/F1 for the baseline model. print() is needed
# because classification_report returns a multi-line string.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90      2298
           1       0.73      0.58      0.64       719

    accuracy                           0.85      3017
   macro avg       0.80      0.75      0.77      3017
weighted avg       0.84      0.85      0.84      3017



In [10]:
# Oversample the training split with SMOTE (seeded for reproducibility).
# Only the training data is resampled; the test split keeps its original
# class distribution.
sm = SMOTE(random_state=12)
# fit_resample is the supported API; fit_sample was a deprecated alias that
# newer imbalanced-learn releases no longer provide.
x_train_res, y_train_res = sm.fit_resample(training_features, training_target)

In [11]:
# Refit the same estimator object on the SMOTE-resampled training data.
model.fit(X=x_train_res, y=y_train_res)



LogisticRegression(C=10000000000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Predictions of the SMOTE-trained model on the (un-resampled) test split.
y_pred = model.predict(X=test_features)

In [13]:
# ROC AUC of the SMOTE-trained model on the test split.
# NOTE(review): scored on the output of model.predict (class predictions), not
# on predicted probabilities -- confirm this is the intended metric.
roc_auc_score(y_true=test_target, y_score=y_pred)

0.7707545776638329

In [14]:
# Per-class precision/recall/F1 for the SMOTE-trained model.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.75      0.83      2298
           1       0.50      0.79      0.61       719

    accuracy                           0.76      3017
   macro avg       0.71      0.77      0.72      3017
weighted avg       0.82      0.76      0.78      3017



In [15]:
# Alternative to resampling: a fresh logistic regression with
# class_weight='balanced', using the same C as the earlier models.
model = LogisticRegression(
    C=1e10,
    class_weight='balanced',
)

In [16]:
# Fit the class-weighted model on the original (non-resampled) training data.
model.fit(X=training_features, y=training_target)



LogisticRegression(C=10000000000.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Predictions of the class-weighted model on the test split.
y_pred = model.predict(X=test_features)

In [18]:
# ROC AUC of the class-weighted model on the test split.
# NOTE(review): scored on the output of model.predict (class predictions), not
# on predicted probabilities -- confirm this is the intended metric.
roc_auc_score(y_true=test_target, y_score=y_pred)

0.7782291791495538

In [19]:
# Per-class precision/recall/F1 for the class-weighted model.
print(classification_report(y_true=test_target, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.75      0.83      2298
           1       0.50      0.81      0.62       719

    accuracy                           0.76      3017
   macro avg       0.71      0.78      0.72      3017
weighted avg       0.82      0.76      0.78      3017



In [20]:
# Train-test split (same 90/10 proportion and seed as the modelling cells).
# NOTE(review): this re-split replaces the one-hot-encoded frames with the raw
# columns of df; SMOTE presumably needs numeric input, so confirm that
# adult_clean.csv is fully numeric or encode before resampling.
training_features, test_features, \
training_target, test_target, = train_test_split(df.drop(['income_num'], axis=1),
                                               df['income_num'],
                                               test_size = .1,
                                               random_state=12)

# Assemble the held-out set: features plus the label in a "target" column.
# assign() returns a new frame, so test_features itself is never modified
# (pd.DataFrame(test_features) does not guarantee an independent copy).
test = test_features.assign(target=test_target)

# Apply SMOTE to the training split only; the test set keeps its original
# class distribution. fit_resample replaces the deprecated fit_sample alias.
sm = SMOTE(random_state=12)
x_train_res, y_train_res = sm.fit_resample(training_features, training_target)

# Gather the resampled features and labels in a single frame, restoring the
# original feature names in case the resampler returned a plain array.
adult_smote = pd.DataFrame(x_train_res, columns=training_features.columns)
adult_smote["target"] = y_train_res

# Export both sets to CSV
test.to_csv("test.csv", encoding='utf-8', index=False)
adult_smote.to_csv("adult_smote.csv", encoding='utf-8', index=False)

In [25]:
# Sanity check: class counts in the resampled training set.
adult_smote["target"].value_counts()

1    20356
0    20356
Name: target, dtype: int64