In [88]:
# general packages and libraries
import os
import sys
from collections import defaultdict
import importlib

# data manipulation packages
import numpy as np
import pandas as pd

# data visualizations packages
import matplotlib.pyplot as plt
# to render plots in the notebook
%matplotlib inline

import seaborn as sns
# set a theme for seaborn
sns.set_theme()

# numerical, statistical and machine learning packages and libraries
import xgboost as xgb
from scipy import stats

from sklearn import (
    ensemble,
    tree,
)
from sklearn.base import (
    BaseEstimator, 
    TransformerMixin,
)
from sklearn.pipeline import (
    make_pipeline,
    FeatureUnion, 
    Pipeline,
)
from sklearn.feature_selection import (
    SelectKBest, 
    chi2, 
    mutual_info_classif,
    f_classif,
)
from sklearn.impute import (
    KNNImputer,
    SimpleImputer,
)
from sklearn.preprocessing import (
    OneHotEncoder, 
    OrdinalEncoder, 
    LabelEncoder,
    StandardScaler,
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    KFold,
    cross_val_score,
)

from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
#from sklearn.ensemble import (RandomForestClassifier)

from sklearn.linear_model import (
    SGDClassifier,
    LogisticRegression,
) 
from sklearn.metrics import (
    classification_report,
    r2_score, 
    mean_squared_error,
    auc,
    confusion_matrix,
    accuracy_score,
    roc_auc_score,
    roc_curve,
)


# Analysis of StackOverflow Survey

In this notebook we collect several useful tidbits developed or collected during the analysis of the StackOverflow Survey. The examples are built on the survey data.

In [3]:
# create a path string
mypath = os.getcwd()

# upload the datafiles as pandas dataframes
df1 = pd.read_csv(mypath+'/data/survey20_updated.csv', index_col=[0])

# check the uploaded data
df1.shape

(64461, 61)

## Feature selection


In [None]:
# https://github.com/Chancylin/StackOverflow_Survey/blob/main/code/data_process.py

def cal_mutual_info(df, target_var=None, disc_features_only=True):
    """Calculate mutual information for feature selection, based on mutual_info_classif from sklearn.feature_selection.

    Works on a copy of ``df``: missing values are filled ('Missing' for
    non-numeric columns, the column mean for numeric ones), the non-numeric
    columns and the target are ordinal-encoded, and every feature is scored
    against the target.

    :param df: Pandas dataframe holding both the features and the target column
    :param target_var: name of the target column in ``df`` (required, despite the ``None`` default)
    :param disc_features_only: boolean, calculate mutual information for discrete
        (non-numeric) features only; if False all features are scored
    :return:
        a Pandas dataframe with mutual information for features
        (columns ``'columns'`` and ``'mutual_info'``)
    :raises ValueError: if ``target_var`` is not a column of ``df`` or the target
        contains missing values
    """
    if target_var is None or target_var not in df.columns:
        raise ValueError(f"target_var must be a column of df, got {target_var!r}")
    if df[target_var].isna().any():
        raise ValueError(f"target column {target_var!r} contains missing values")

    df = df.copy()

    # dtypes of the feature columns only (target excluded)
    df_f_type = df.dtypes
    df_f_type = df_f_type.loc[~df_f_type.index.isin([target_var])].copy()
    cols_if_num = df_f_type.apply(lambda x: np.issubdtype(x, np.number))
    # every non-numeric feature is treated as discrete
    discrete_f = ~cols_if_num
    cols_num = cols_if_num[cols_if_num].index.tolist()
    cols_cat = cols_if_num[~cols_if_num].index.tolist()

    for col_cat in cols_cat:
        df[col_cat] = df[col_cat].fillna('Missing')

    for col_num in cols_num:
        df[col_num] = df[col_num].fillna(df[col_num].mean())

    # the encoder needs at least one column, so skip it for all-numeric frames
    if cols_cat:
        df[cols_cat] = OrdinalEncoder().fit_transform(df[cols_cat])

    # Encode the target into a plain 1-D float array. Assigning the 2-D result
    # through ``df.loc[:, target_var]`` can keep the column as object dtype,
    # which mutual_info_classif rejects as an unknown label type.
    target = OrdinalEncoder().fit_transform(df[[target_var]]).ravel()

    if not disc_features_only:
        all_features = df_f_type.index.tolist()
        mutual_info = mutual_info_classif(df[all_features], target,
                                          discrete_features=discrete_f.values,
                                          n_neighbors=20,
                                          random_state=123)
        return pd.DataFrame(data=zip(all_features, mutual_info), columns=['columns', 'mutual_info'])

    if not cols_cat:
        # nothing discrete to score: return an empty frame with the usual columns
        return pd.DataFrame(columns=['columns', 'mutual_info'])

    mutual_info = mutual_info_classif(df[cols_cat], target,
                                      discrete_features=True)
    return pd.DataFrame(data=zip(cols_cat, mutual_info), columns=['columns', 'mutual_info'])


### My approach: step_by_step

In [15]:
# the list of numerical columns
num_cols = df1.select_dtypes(include='float64').columns.to_list()
print(num_cols)

['Age', 'CompTotal', 'ConvertedComp', 'WorkWeekHrs']


In [16]:
# the list of categorical columns, drop the target 'JobSat'
cat_cols = df1.select_dtypes(include='object').columns.to_list()
print(cat_cols)

['MainBranch', 'Hobbyist', 'Age1stCode', 'CompFreq', 'Country', 'CurrencyDesc', 'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith', 'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors', 'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith', 'MiscTechDesireNextYear', 'MiscTechWorkedWith', 'CollabToolsDesireNextYear', 'CollabToolsWorkedWith', 'DevOps', 'DevOpsImpt', 'EdImpt', 'JobHunt', 'JobHuntResearch', 'Learn', 'OffTopic', 'OnboardGood', 'OtherComms', 'Overtime', 'PurchaseResearch', 'PurpleLink', 'SOSites', 'Stuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear', 'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount', 'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength', 'Trans', 'UndergradMajor', 'WebframeDesireNextYear', 'WebframeWorkedWith', 'WelcomeChange', 'YearsCode', 'YearsCodePro', 'DevClass']


In [18]:
# the list of columns with high cardinality - will get a high selection score
multiple = ['DatabaseDesireNextYear', 'DatabaseWorkedWith', 
           'LanguageDesireNextYear', 'LanguageWorkedWith',
            'MiscTechDesireNextYear', 'MiscTechWorkedWith', 
            'CollabToolsDesireNextYear', 'CollabToolsWorkedWith', 
            'PlatformDesireNextYear', 'PlatformWorkedWith', 
            'WebframeDesireNextYear', 'WebframeWorkedWith']

In [20]:
# remove target 'JobSat' and the multiple columns from cat_cols
cat_cols = list(set(cat_cols) - set(multiple))
cat_cols.remove('JobSat')
print(cat_cols)

['Employment', 'Stuck', 'Age1stCode', 'UndergradMajor', 'YearsCodePro', 'Sexuality', 'Ethnicity', 'EdImpt', 'Gender', 'JobSeek', 'DevOpsImpt', 'Trans', 'DevType', 'OpSys', 'Overtime', 'JobFactors', 'OtherComms', 'SOVisitFreq', 'Hobbyist', 'SurveyEase', 'YearsCode', 'PurchaseWhat', 'PurpleLink', 'PurchaseResearch', 'SOSites', 'JobHunt', 'WelcomeChange', 'MainBranch', 'SurveyLength', 'Country', 'OnboardGood', 'SOComm', 'DevClass', 'DevOps', 'JobHuntResearch', 'OrgSize', 'CompFreq', 'EdLevel', 'CurrencyDesc', 'OffTopic', 'SOAccount', 'CurrencySymbol', 'SOPartFreq', 'Learn']


In [25]:
# drop all missing entries in 'JobSat'
df1.dropna(subset=['JobSat'], inplace=True)
# check output
df1['JobSat'].isnull().sum()

0

In [27]:
# fill in 'missing' in all categorical columns in the list
for col in cat_cols:
        df1[col] = df1[col].fillna('missing')
# check outcome
df1.isnull().sum()

MainBranch          0
Hobbyist            0
Age              8839
Age1stCode          0
CompFreq            0
                 ... 
WelcomeChange       0
WorkWeekHrs      4137
YearsCode           0
YearsCodePro        0
DevClass            0
Length: 61, dtype: int64

In [31]:
# fill in the median in all numerical columns in the list
for col in num_cols:
        df1[col] = df1[col].fillna(df1[col].median())
# check outcome
df1[num_cols].isnull().sum()

Age              0
CompTotal        0
ConvertedComp    0
WorkWeekHrs      0
dtype: int64

In [152]:
# scale the numerical columns

# create an instance of the scaler
scaler = StandardScaler()

# fit the scaler and standardise the numerical columns.
# The scaled array is assigned directly: wrapping it in a new DataFrame would
# give it a fresh 0..n-1 index, and pandas would then align on index labels,
# which no longer match df1 once rows have been dropped.
df1[num_cols] = scaler.fit_transform(df1[num_cols])

In [153]:
# encode the categorical variables
enc = OrdinalEncoder()
df1[cat_cols] = enc.fit_transform(df1[cat_cols])
# encode the target with its own encoder; the (n, 1) result is flattened and
# assigned as a whole column so that 'JobSat' becomes a numeric column
# (assigning through .loc[:, 'JobSat'] can keep the old object dtype)
enc = OrdinalEncoder()
df1['JobSat'] = enc.fit_transform(df1[['JobSat']]).ravel()

In [146]:
# feature selection function
# to be rewritten as a custom transformer class
def select_features(X_train, y_train, score_function, kval, no_cols): # add X_test
    """
    Function for feature selection for (discrete) variables.
    INPUT: X_train - input dataframe, must have only discrete or only continuous features, 
                     pre-processed by removing/imputing missing values and encoded
           y_train - target pd.series, pre-processed
           score_function - can be any of the score functions supported by SelectKBest
           kval = the number of best features to return, can be 'all'
           no_cols = number of columns to print
    OUTPUT: dataframe with two columns, 'cat_columns' for the names of the selected
            columns of X_train and 'kbest_scores' for their scores, sorted in
            decreasing order of the scores and truncated to the first no_cols rows
    """
    # create an instance of the selector
    fs = SelectKBest(score_func=score_function, k=kval)
    # fit the selector on the train set and the train target values
    fs.fit(X_train, y_train)
    # positions of the k best columns within X_train
    cols_info = fs.get_support(indices=True)
    cols = X_train.columns[cols_info]
    # fs.scores_ holds one score per *input* column, so it has to be indexed with
    # the same positions; zipping it directly against the selected names would
    # pair them with the scores of the first k columns whenever kval != 'all'
    frame_best = pd.DataFrame(data=zip(cols, fs.scores_[cols_info]),
                              columns=['cat_columns', 'kbest_scores'])
    return frame_best.sort_values(by='kbest_scores', ascending=False).head(no_cols)

In [147]:
select_features(df1[cat_cols], df1['JobSat'], mutual_info_classif, 'all', 6)

Unnamed: 0,cat_columns,kbest_scores
9,JobSeek,0.095091
30,OnboardGood,0.03125
25,JobHunt,0.025677
38,CurrencyDesc,0.024917
29,Country,0.022552
41,CurrencySymbol,0.016086


In [157]:
# alternate approach, using mutual_info_classif
mutual_info = mutual_info_classif(df1[cat_cols], df1['JobSat'],
                                          discrete_features='auto',
                                          n_neighbors=3,
                                          copy=True,
                                          random_state=42)
df_mutual_info = pd.DataFrame(data=zip(cat_cols, mutual_info), columns=['columns', 'mutual_info'])

## Discretise continuous variables

### Create bins for the WorkWeekHrs column

In [None]:
# create the labels
cut_labels = ['less-10', '10-20', '20-30', '30-40', '40-50', 'more-50']

# define the bin edges; the last bin is open-ended so the edges stay strictly
# increasing even when the largest value in the data is 50 or less
cut_bins = [0, 10, 20, 30, 40, 50, np.inf]

# create a new column which contains the new labels
# (include_lowest=True keeps rows with exactly 0 hours in the first bin;
#  by default pd.cut excludes the left edge and would turn them into NaN)
df1['WorkWeek_Bins'] = pd.cut(df1['WorkWeekHrs'], bins=cut_bins, labels=cut_labels,
                              include_lowest=True)

# check for success
df1['WorkWeek_Bins'].value_counts()

In [None]:
# change the type of the newly created column
df1['WorkWeek_Bins'] = df1['WorkWeek_Bins'].astype('object')

In [None]:
# drop the WorkWeekHrs column
df1.drop(columns = 'WorkWeekHrs', inplace=True);

### Create bins for the ConvertedComp column

In [None]:
# we could use quantile, however I prefer custom bins here
cut_labels = ['less-10K', '10K-30K', '30K-50K', '50K-100K', '100K-200K', 'more-200K']

# define the bin edges; the last bin is open-ended so the edges stay strictly
# increasing even when the largest value in the data is 200000 or less
cut_bins = [0, 10000, 30000, 50000, 100000, 200000, np.inf]

# create a new column which contains the new labels
# (include_lowest=True keeps a compensation of exactly 0 in the first bin;
#  by default pd.cut excludes the left edge and would turn it into NaN)
df1['Comp_Bins'] = pd.cut(df1['ConvertedComp'], bins=cut_bins, labels=cut_labels,
                          include_lowest=True)

# change the type of the newly created column
df1['Comp_Bins'] = df1['Comp_Bins'].astype('object')

# drop the ConvertedComp column
df1.drop(columns = 'ConvertedComp', inplace=True);

### Create bins for the Age column

In [None]:
# create the labels
cut_labels = ['<20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '>80']

# define the bin edges; the last bin is open-ended so the edges stay strictly
# increasing even when the largest age in the data is 80 or less
cut_bins = [0, 20, 30, 40, 50, 60, 70, 80, np.inf]

# create a new column which contains the new labels
# (include_lowest=True keeps a value of exactly 0 in the first bin;
#  by default pd.cut excludes the left edge and would turn it into NaN)
df1['Age_Bins'] = pd.cut(df1['Age'], bins=cut_bins, labels=cut_labels,
                         include_lowest=True)

# change the type of the newly created column
df1['Age_Bins'] = df1['Age_Bins'].astype('object')

# drop the Age column
df1.drop(columns = 'Age', inplace=True);

### Remove the rows that contain mostly missing values

In [None]:
# keep only the rows with at least 10 non-NA values
# (dropna returns a new dataframe, so the result has to be assigned back;
#  calling it without assignment leaves df1 unchanged)
df1 = df1.dropna(thresh=10)

# check the result
df1.shape

In [None]:
df1.JobSat.isnull().sum()

In [None]:
df1.dropna( how='any', subset=['JobSat'], inplace=True)

In [None]:
df1.JobSat.isnull().sum()

### Remove duplicates

In [None]:
# drop duplicate rows, if any
df1.drop_duplicates(subset=None, keep='first', inplace=True)
df1.shape

## Create features and target

Create a dataframe (X) with the features and a pandas series (y) that contains the labels.

In [None]:
# create a copy of the pre-processed dataframe
df2 = df1.copy()

In [None]:
# create the predictors dataframe
X = df2.drop(columns = 'JobSat')

# create the labels
y = df2['JobSat']

# check for success
X.info(), len(y)

In [None]:
# format all the fields as strings in the feature matrix, but keep missing
# values as NaN: a plain astype(str) turns NaN into the literal text 'nan',
# which the imputation step further down would no longer recognise as missing
X = X.astype(str).where(X.notna())

## Sample data

We will hold out $30 \%$ of the data for testing:

In [None]:
# split the data in train and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# summarize the data
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

## Impute missing values

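Now that we have train and test data, we can impute missing values: the imputer is fitted on the training set only, and the fitted imputer is then used to fill in both the train and the test datasets. The KNN imputer from sklearn is one option (left commented out below); the code below uses `SimpleImputer` with a constant fill value instead.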

In [None]:
# create an instance of the imputer
#imputer = KNNImputer(n_neighbors=5)

# fit the imputer on the dataset
#X_train_trans = pd.DataFrame(imputer.fit_transform(X_train), columns = X_train.columns)

# check for success
#X_train_trans.isna().any()
from sklearn.impute import SimpleImputer
def impute_predictors(X_train, X_test):
    """
    Fill missing predictor values with the constant string 'missing'.
    The imputer is fitted on the train set only and then applied to both sets.
    INPUT: X_train - dataframe with the train predictors
           X_test - dataframe with the test predictors, same columns as X_train
    OUTPUT: tuple (X_train_trans, X_test_trans) of imputed dataframes that keep
            the column names and the row index of the corresponding input
    """
    imputer = SimpleImputer(strategy='constant', fill_value='missing')
    imputer.fit(X_train)
    # keep the original row index so the imputed frames stay aligned with the
    # target series produced by the same train/test split
    X_train_trans = pd.DataFrame(imputer.transform(X_train),
                                 columns=X_train.columns, index=X_train.index)
    X_test_trans = pd.DataFrame(imputer.transform(X_test),
                                columns=X_test.columns, index=X_test.index)
    return X_train_trans, X_test_trans

In [None]:
X_train_trans, X_test_trans = impute_predictors(X_train, X_test)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, [0])
    ])

In [None]:
X_train_trans.isna().sum()

## Encode the data

The best practice when encoding variables is to fit the encoding on the training dataset, then apply it to the train and test datasets.

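The function below, named `encode_predictors()`, takes the input data for the train and test sets, fits a one-hot encoder on the training set, and returns the encoded train and test sets.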

In [None]:
 # prepare input data
def encode_predictors(X_train, X_test):
    """
    One-hot encode the predictors.
    The encoder is fitted on the train set only; categories that appear only in
    the test set are ignored (handle_unknown='ignore').
    INPUT: X_train - train predictors
           X_test - test predictors
    OUTPUT: tuple (X_train_enc, X_test_enc) with the encoded train and test sets
    """
    encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train)
    return encoder.transform(X_train), encoder.transform(X_test)

In [None]:
X_train_enc, X_test_enc = encode_predictors(X_train_trans, X_test_trans)

In [None]:
X_train_enc

Regarding the output data, the target, since it is already encoded as an integer with values from 0 to 5, no other encoding steps are needed at this point.

Alternative would be:

In [None]:
# prepare target
#def encode_targets(y_train, y_test):
	#le = LabelEncoder()
	#le.fit(y_train)
	#y_train_enc = le.transform(y_train)
	#y_test_enc = le.transform(y_test)
	#return y_train_enc, y_test_enc

## Refactor code


In [None]:
X_train_fs

In [None]:
# create a path string
mypath = os.getcwd()

# read the data from the file
df = pd.read_csv(mypath+'/data/survey20_updated.csv')
# preprocess, split and process data
# NOTE(review): `uf` is not imported in any cell visible in this notebook;
# presumably a project module with utility functions -- import it in the
# imports cell before running this cell (TODO confirm the module name)
preproc_df = uf.preprocess_data(df)
# the four results are unpacked in the order train X, train y, test X, test y
X_train, y_train, X_test, y_test = uf.process_data(preproc_df, 'JobSat')

## Baseline model: K NearestNeighbors

In [None]:
# create an instance of the classifier
knn_clf = KNeighborsClassifier(n_neighbors=5)

# fit the classifier
knn_clf.fit(X_train_fs, y_train)

# predict output values
y_pred = knn_clf.predict(X_test_fs)

In [None]:
# print evaluation metrics and results

result1 = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(result1)

result2 = classification_report(y_test, y_pred)
print('\nClassification Report:')
print (result2)

result3 = accuracy_score(y_test,y_pred)  
print('Accuracy: %.3f' %result3)
         

## Several other algorithms 

In [None]:
from sklearn.svm import SVC

# create classifier instance
svm_clf = SVC(gamma="auto", random_state=42)
# fit the model
svm_clf.fit(X_train_fs, y_train)

In [None]:
# predict on the test set
y_pred = svm_clf.predict(X_test_fs)

# test one value
y_test.iloc[20],  y_pred[20]

In [None]:
some_digit_scores = svm_clf.decision_function(X_test_fs)
some_digit_scores[20]

In [None]:
np.argmax(some_digit_scores[20])

In [None]:
svm_clf.classes_

In [None]:
y_test.values[0]

In [None]:
from sklearn.model_selection import cross_val_score
# cross-validate on the training data; the test set stays held out for the
# final evaluation and must not be used to estimate/compare models
cross_val_score(svm_clf, X_train_fs, y_train, cv=10, scoring="accuracy")

In [None]:
# print evaluation metrics and results

result1 = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(result1)

result2 = classification_report(y_test, y_pred, zero_division=0)
print('\nClassification Report:')
print (result2)

result3 = accuracy_score(y_test,y_pred)  
print('Accuracy: %.3f' %result3)
         

In [None]:
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SVC(gamma="auto", random_state=42))
ovr_clf.fit(X_train_fs, y_train)
y_pred = ovr_clf.predict(X_test_fs)

In [None]:
result1 = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(result1)

result2 = classification_report(y_test, y_pred, zero_division=0)
print('\nClassification Report:')
print (result2)

result3 = accuracy_score(y_test,y_pred)  
print('Accuracy: %.3f' %result3)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Always scale the input. The most convenient way is to use a pipeline.
sgd_clf = make_pipeline(StandardScaler(with_mean=False),SGDClassifier(max_iter=1000, tol=1e-3))
sgd_clf.fit(X_train_fs, y_train)
y_pred = sgd_clf.predict(X_test_fs)

In [None]:
result1 = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(result1)

result2 = classification_report(y_test, y_pred, zero_division=0)
print('\nClassification Report:')
print (result2)

result3 = accuracy_score(y_test,y_pred)  
print('Accuracy: %.3f' %result3)

In [None]:
X = pd.concat([X_train_fs, X_test_fs])
y = pd.concat([y_train, y_test])

In [None]:
from sklearn import model_selection
from sklearn.linear_model import (LogisticRegression)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import (KNeighborsClassifier)
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier)
import xgboost as xgb
from sklearn.metrics import classification_report

In [None]:
# compare several classifiers with 10-fold cross-validation on the training set
# (the test set stays held out for the final evaluation)
X_train_dense = X_train_fs.toarray()
kfold = model_selection.KFold(n_splits=10)
for model in [DecisionTreeClassifier, KNeighborsClassifier, GaussianNB, SVC, 
              RandomForestClassifier, SGDClassifier]:
    # keep the pipeline: the scaler is then re-fitted on the training part of
    # every fold instead of being built and thrown away
    classifier = make_pipeline(StandardScaler(), model())
    # cross_val_score clones and fits the estimator itself for every fold,
    # so no separate fit call is needed
    s = model_selection.cross_val_score(classifier, X_train_dense, y_train, cv=kfold)
    print(f"{model.__name__:22}  CV_Mean:" f"{s.mean():.3f} CV_STD: {s.std():.2f}")

In [None]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
xgb_model.fit(X_train_fs, y_train)


In [None]:
y_pred = xgb_model.predict(X_test_fs)

print(confusion_matrix(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
import numpy as np

from scipy.stats import uniform, randint

from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

In [None]:
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)

params = {
    "colsample_bytree": uniform(0.7, 0.3),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 150), # default 100
    "subsample": uniform(0.6, 0.4)
}

search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, 
                            n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)

search.fit(X_train_fs, y_train)

#report_best_scores(search.cv_results_, 1)

In [None]:
search.cv_results_['mean_test_score']

In [None]:
search.best_params_

In [None]:
#from stack
# the target has more than two classes, so use the multi-class objective
# (same as the other XGBoost cells) instead of 'binary:logistic'
clf_xgb = xgb.XGBClassifier(objective = 'multi:softprob')
param_dist = {'n_estimators': stats.randint(150, 1000),
              'learning_rate': stats.uniform(0.01, 0.59),
              'subsample': stats.uniform(0.3, 0.6),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'colsample_bytree': stats.uniform(0.5, 0.4),
              'min_child_weight': [1, 2, 3, 4]
             }

numFolds = 5
# sklearn.cross_validation was removed from scikit-learn; the current KFold
# (sklearn.model_selection) takes n_splits and gets the data at split time
kfold_5 = KFold(n_splits = numFolds, shuffle = True, random_state = 42)

clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = kfold_5,  
                         n_iter = 5, # number of parameter settings that are sampled
                         # plain 'roc_auc' only supports binary targets;
                         # with error_score = 0 every multi-class fit would
                         # silently be scored 0
                         scoring = 'roc_auc_ovr', 
                         error_score = 0, 
                         verbose = 3, 
                         n_jobs = -1,
                         random_state = 42)

In [None]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5, shuffle=True, random_state=42)


def _take_rows(data, rows):
    """Select rows by position from a dataframe/series or an array-like."""
    return data.iloc[rows] if hasattr(data, 'iloc') else data[rows]


scores = []

# split, fit and evaluate on the same object (the training set); fold-local
# names are used so the global train/test variables are not overwritten
for train_index, test_index in kfold.split(X_train_fs):   
    X_fold_train = _take_rows(X_train_fs, train_index)
    X_fold_val = _take_rows(X_train_fs, test_index)
    y_fold_train = _take_rows(y_train, train_index)
    y_fold_val = _take_rows(y_train, test_index)

    xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
    xgb_model.fit(X_fold_train, y_fold_train)
    
    y_fold_pred = xgb_model.predict(X_fold_val)
    
    scores.append(mean_squared_error(y_fold_val, y_fold_pred))

# per-fold root mean squared error and its summary
# (display_scores is not defined anywhere in this notebook)
rmse_scores = np.sqrt(scores)
print('Scores:', rmse_scores)
print('Mean: %.3f' % rmse_scores.mean())
print('Standard deviation: %.3f' % rmse_scores.std())

In [None]:
cls = RandomForestClassifier()
kfold = model_selection.KFold(n_splits=10)
s = model_selection.cross_val_score(cls, X,y, cv=kfold)

In [None]:
cls_rf = RandomForestClassifier()
kfold = model_selection.KFold(n_splits=10)
cls_rf.fit(X_train, y_train)
y_pred = cls_rf.predict(X_test)
s = model_selection.cross_val_score(cls_rf, X_test,y_test, cv=kfold)

In [None]:
print(classification_report(y_test, y_pred, target_names=labels))

In [None]:
df1 = pd.read_csv(mypath+'/data/survey20_updated.csv')

In [None]:
df1.shape

In [None]:
df1 = df1[df1['JobSat'] !=0]

In [None]:
df1.shape

In [None]:
df1.JobSat.value_counts()

In [None]:
# read the data from the file
#df = pd.read_csv(mypath+'/data/survey20_updated.csv')
# preprocess, split and process data
preproc_df1 = uf.preprocess_data(df1)
X1_train, y1_train, X1_test, y1_test = uf.process_data(preproc_df1, 'JobSat')

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Always scale the input. The most convenient way is to use a pipeline.
sgd_clf = make_pipeline(StandardScaler(),SGDClassifier(max_iter=1000, tol=1e-3))
sgd_clf.fit(X1_train, y1_train)
y1_pred = sgd_clf.predict(X1_test)

In [None]:
result1 = confusion_matrix(y1_test, y1_pred)
print('Confusion Matrix:')
print(result1)

result2 = classification_report(y1_test, y1_pred, zero_division=0)
print('\nClassification Report:')
print (result2)

result3 = accuracy_score(y1_test,y1_pred)  
print('Accuracy: %.3f' %result3)

In [None]:
# create a path string
mypath = os.getcwd()

df2 = pd.read_csv(mypath+'/data/survey20_updated.csv')
df2.columns

In [None]:
# preprocess, split and process data
# (preproc_df2 has to be created here: with this line commented out the next
#  line fails with a NameError on a fresh kernel)
preproc_df2 = uf.preprocess_data(df2)
X2_train, y2_train, X2_test, y2_test = uf.process_data(preproc_df2, 'JobSat')

In [None]:
X2 = pd.concat([X2_train, X2_test])
y2 = pd.concat([y2_train, y2_test])

In [None]:
X2.shape

In [None]:
# compare several classifiers with 10-fold cross-validation on the full data set
kfold = model_selection.KFold(n_splits=10)
for model in [DecisionTreeClassifier, KNeighborsClassifier, GaussianNB, SVC, 
              RandomForestClassifier, SGDClassifier]:
    # keep the pipeline: the scaler is then re-fitted on the training part of
    # every fold instead of being built and thrown away
    classifier = make_pipeline(StandardScaler(), model())
    # cross_val_score clones and fits the estimator itself for every fold,
    # so no separate fit call is needed
    s = model_selection.cross_val_score(classifier, X2, y2, cv=kfold)
    print(f"{model.__name__:22}  CV_Mean:" f"{s.mean():.3f} CV_STD: {s.std():.2f}")

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Always scale the input. The most convenient way is to use a pipeline.
sgd_clf = make_pipeline(StandardScaler(),SGDClassifier(max_iter=1000, tol=1e-3))
sgd_clf.fit(X2_train, y2_train)
y2_pred = sgd_clf.predict(X2_test)

In [None]:
result1 = confusion_matrix(y2_test, y2_pred)
print('Confusion Matrix:')
print(result1)

result2 = classification_report(y2_test, y2_pred, zero_division=0)
print('\nClassification Report:')
print (result2)

result3 = accuracy_score(y2_test,y2_pred)  
print('Accuracy: %.3f' %result3)