In [None]:
# Ignore dumb warning through the iterations
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# General libraries
import numpy as np
import seaborn as sns
import pandas as pd
import time
import matplotlib.pyplot as plt
import joblib

# Scikit learn libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, f1_score

# Classifiers
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import SVC

LDA
QDA

In [None]:
aux = pd.read_csv('../data/clean_data.csv', index_col='Municipality')
aux[['culture_knowledge_of_catalan_speak', 'population_population_by_sex_total']].corr()

In [None]:
# Get data
df = pd.read_csv('../data/clean_data.csv', index_col='Municipality')
dg = pd.read_csv('../data/model_data.csv', index_col='Municipality')
# dg.head()

# Get proper label
labels = df.filter(like='catalan')
label = df.filter(like='catalan').filter(like='speak')
df = df.loc[:, ~df.columns.isin(labels.columns)]

data = label.join(dg, on='Municipality')
# data.head()

# Get trainin data and label
label = data['culture_knowledge_of_catalan_speak'].to_frame()
data = data.drop(['culture_knowledge_of_catalan_speak'], axis=1)

sns.displot(label)

label[:] = pd.qcut(label['culture_knowledge_of_catalan_speak'], q=4, labels=[0,1,2,3])
label[label['culture_knowledge_of_catalan_speak'] == 2] = 1
label[label['culture_knowledge_of_catalan_speak'] == 3] = 2

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.33, stratify=label, random_state=42)

Linear Discriminant Analysis

In [None]:
lda = LinearDiscriminantAnalysis(solver='eigen', shrinkage=0.1)
lda.fit(X_train, y_train.values.ravel())

In [None]:
start_time = time.time()
model = LinearDiscriminantAnalysis()

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = [
    {'solver': ['svd']},
    {'solver': ['lsqr', 'eigen'], 'shrinkage': [np.arange(0, 1, 0.01)]}
]

search = RandomizedSearchCV(model, random_grid, n_iter=500, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

In [None]:
joblib.dump(search, '../pickles/search_lda.pkl')

In [None]:
start_time = time.time()
model = QuadraticDiscriminantAnalysis()

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = {
    'reg_param': np.arange(0, 1, 0.01)
}

search = RandomizedSearchCV(model, random_grid, n_iter=500, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

In [None]:
joblib.dump(search, '../pickles/search_qda.pkl')

In [None]:
start_time = time.time()
model = GaussianNB()

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = {
    'var_smoothing': [1e-9, 1e-7, 1e-3]
}

search = RandomizedSearchCV(model, random_grid, n_iter=500, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

In [None]:
joblib.dump(search, '../pickles/search_gaus.pkl')

Logistic Regression

In [None]:
from scipy.stats import loguniform

start_time = time.time()
model = LogisticRegression(random_state=42, max_iter=500)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = [
    {'solver': ['newton-cg'], 'penalty': ['l2'], 'C': loguniform(1e-1, 100), 'class_weight': [None, 'balanced']},
    {'solver': ['lbfgs'], 'penalty': ['l2'], 'C': loguniform(1e-1, 100), 'class_weight': [None, 'balanced']},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': loguniform(1e-1, 100), 'class_weight': [None, 'balanced']}
    # {'solver': ['saga'], 'penalty': ['elasticnet', 'l1', 'l2', 'none'], 'C': loguniform(1e-3, 100), 'class_weight': [None, 'balanced']}
]

search = RandomizedSearchCV(model, random_grid, n_iter=500, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

In [None]:
joblib.dump(search, '../pickles/search_lr.pkl')

In [None]:
# def plot_grid_results(grid):
#     import pandas as pd
    
#     data = pd.DataFrame(grid.cv_results_).filter(regex=r'mean_test_score|std_test_score|param_')
#     return data 

# df = plot_grid_results(search)
# df.head()

In [None]:
# sns.lineplot(x='param_C', y='mean_test_score', hue='param_penalty', data=df)

In [None]:
start_time = time.time()
model = SGDClassifier(random_state=42)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = {
    'loss': ['log', 'hinge', 'perceptron'],
    'penalty': ['elasticnet', 'l2'],
    'alpha': [10 ** x for x in range(-6, 1)],
    'l1_ratio': [0, 0.05, 0.1, 0.2, 0.5, 0.8, 0.9, 0.95, 1],
    'class_weight': [None, 'Balanced']
}

search = RandomizedSearchCV(model, random_grid, n_iter=500, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

In [None]:
joblib.dump(search, '../pickles/search_sgd.pkl')

In [None]:
start_time = time.time()
model = KNeighborsClassifier()

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = {
    'n_neighbors': range(2, 30, 2),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'kd_tree']
}

search = RandomizedSearchCV(model, random_grid, n_iter=500, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

In [None]:
joblib.dump(search, '../pickles/search_knn.pkl')

In [None]:
start_time = time.time()
model = DecisionTreeClassifier(random_state=42)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = {
    'criterion':['gini','entropy'],
    'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]
}

search = RandomizedSearchCV(model, random_grid, n_iter=500, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

In [None]:
joblib.dump(search, '../pickles/search_dt.pkl')

In [None]:
start_time = time.time()
model = RandomForestClassifier(oob_score=True, random_state=42)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

search = RandomizedSearchCV(model, random_grid, n_iter=50, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

In [None]:
joblib.dump(search, '../pickles/search_rf.pkl')

In [None]:
start_time = time.time()
model = SVC(random_state=42)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000], 'class_weight': [None, 'balanced'], 'decision_function_shape': ['ovr', 'ovo']},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000], 'class_weight': [None, 'balanced'], 'decision_function_shape': ['ovr', 'ovo']},
    {'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000], 'class_weight': [None, 'balanced'], 'decision_function_shape': ['ovr', 'ovo']}
]

search = RandomizedSearchCV(model, random_grid, n_iter=75, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

In [None]:
joblib.dump(search, '../pickles/search_svm.pkl')

In [None]:
import joblib

search_lr = joblib.load('../pickles/search_lr.pkl')

In [None]:
search_lr.best_params_

In [None]:
search_lr.best_score_

Voting Classifier

In [None]:
voting_hard = VotingClassifier([('dt', model_tree), ('rf', model_rf1),
                                ('gnb', gauss_nb),('extratrees',extra_trees)])
voting_hard.fit(X_train, y_train)

voting_soft = VotingClassifier([('dt', model_tree), ('rf', model_rf1),
                                ('gnb', gauss_nb),('extratrees',extra_trees)],voting='soft')
voting_soft.fit(X_train, y_train)

stacky = StackingClassifier(
    estimators=[('rf', rf_model_tuned), ('extratrees', extra_trees_best)],
    final_estimator=GradientBoostingClassifier())

stacky.fit(X_train, y_train)

In [21]:
start_time = time.time()
model = DecisionTreeClassifier(random_state=42)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = {
    'criterion':['gini','entropy'],
    'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]
}

search = RandomizedSearchCV(model, random_grid, n_iter=500, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

Best Score: 0.593880038555827
Best Hyperparameters: {'max_depth': 4, 'criterion': 'gini'}
Time: 4.157963037490845


In [22]:
joblib.dump(search, '../pickles/search_dt.pkl')

['../pickles/search_dt.pkl']

In [23]:
start_time = time.time()
model = RandomForestClassifier(oob_score=True, random_state=42)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(10, 110, num = 11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

search = RandomizedSearchCV(model, random_grid, n_iter=50, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

Best Score: 0.6399660566235884
Best Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 60, 'bootstrap': True}
Time: 450.0925464630127


In [24]:
joblib.dump(search, '../pickles/search_rf.pkl')

['../pickles/search_rf.pkl']

In [25]:
start_time = time.time()
model = SVC(random_state=42)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

random_grid = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000], 'class_weight': [None, 'balanced'], 'decision_function_shape': ['ovr', 'ovo']},
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000], 'class_weight': [None, 'balanced'], 'decision_function_shape': ['ovr', 'ovo']},
    {'kernel': ['sigmoid'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000], 'class_weight': [None, 'balanced'], 'decision_function_shape': ['ovr', 'ovo']}
]

search = RandomizedSearchCV(model, random_grid, n_iter=75, scoring='f1_weighted', n_jobs=-1, cv=cv, random_state=42)

result = search.fit(X_train, y_train.values.ravel())
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

elapsed_time = time.time() - start_time
print('Time:', elapsed_time)

Best Score: 0.6794373177251799
Best Hyperparameters: {'kernel': 'sigmoid', 'gamma': 0.001, 'decision_function_shape': 'ovr', 'class_weight': None, 'C': 100}
Time: 1940.068866968155


In [26]:
joblib.dump(search, '../pickles/search_svm.pkl')

['../pickles/search_svm.pkl']

In [27]:
import joblib

search_lr = joblib.load('../pickles/search_lr.pkl')

In [28]:
search_lr.best_params_

{'C': 0.10325142852622374,
 'class_weight': None,
 'penalty': 'l2',
 'solver': 'newton-cg'}

In [29]:
search_lr.best_score_

0.6714406385465467

Voting Classifier

In [None]:
voting_hard = VotingClassifier([('dt', model_tree), ('rf', model_rf1),
                                ('gnb', gauss_nb),('extratrees',extra_trees)])
voting_hard.fit(X_train, y_train)

voting_soft = VotingClassifier([('dt', model_tree), ('rf', model_rf1),
                                ('gnb', gauss_nb),('extratrees',extra_trees)],voting='soft')
voting_soft.fit(X_train, y_train)

stacky = StackingClassifier(
    estimators=[('rf', rf_model_tuned), ('extratrees', extra_trees_best)],
    final_estimator=GradientBoostingClassifier())

stacky.fit(X_train, y_train)