# Classification Models on Humor Styles Data

In [139]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [103]:
humor = pd.read_csv('./datasets/hsq_data.csv')

## Exploratory Data Analysis

In [104]:
humor.shape

(1071, 39)

In [105]:
humor.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q30,Q31,Q32,affiliative,selfenhancing,agressive,selfdefeating,age,gender,accuracy
0,2,2,3,1,4,5,4,3,4,3,...,4,2,2,4.0,3.5,3.0,2.3,25,2,100
1,2,3,2,2,4,4,4,3,4,3,...,4,3,1,3.3,3.5,3.3,2.4,44,2,90
2,3,4,3,3,4,4,3,1,2,4,...,5,4,2,3.9,3.9,3.1,2.3,50,1,75
3,3,3,3,4,3,5,4,3,-1,4,...,5,3,3,3.6,4.0,2.9,3.3,30,2,85
4,1,4,2,2,3,5,4,1,4,4,...,5,4,2,4.1,4.1,2.9,2.0,52,1,80


In [106]:
humor.isnull().sum().sum()

0

In [107]:
humor.dtypes.head()

Q1    int64
Q2    int64
Q3    int64
Q4    int64
Q5    int64
dtype: object

In [108]:
humor.describe().T.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Q1,1071.0,2.02521,1.075782,-1.0,1.0,2.0,3.0,5.0
Q2,1071.0,3.34267,1.112898,-1.0,3.0,3.0,4.0,5.0
Q3,1071.0,3.078431,1.167877,-1.0,2.0,3.0,4.0,5.0
Q4,1071.0,2.8338,1.160252,-1.0,2.0,3.0,4.0,5.0
Q5,1071.0,3.59944,1.061281,-1.0,3.0,4.0,4.0,5.0


In [109]:
## Remonving the other/declined leaving Male/Female

humor = humor[humor.gender.isin([1,2])]

In [112]:
# Function to change -1's to Nan's

def neg_to_zero(df):
    return df.replace(-1, np.nan)

In [113]:
humor = neg_to_zero(humor)

In [115]:
humor.isna().sum().sum()

162

In [116]:
humor.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,Q30,Q31,Q32,affiliative,selfenhancing,agressive,selfdefeating,age,gender,accuracy
0,2.0,2.0,3.0,1.0,4.0,5.0,4.0,3.0,4.0,3.0,...,4.0,2.0,2.0,4.0,3.5,3.0,2.3,25,2,100
1,2.0,3.0,2.0,2.0,4.0,4.0,4.0,3.0,4.0,3.0,...,4.0,3.0,1.0,3.3,3.5,3.3,2.4,44,2,90
2,3.0,4.0,3.0,3.0,4.0,4.0,3.0,1.0,2.0,4.0,...,5.0,4.0,2.0,3.9,3.9,3.1,2.3,50,1,75
3,3.0,3.0,3.0,4.0,3.0,5.0,4.0,3.0,,4.0,...,5.0,3.0,3.0,3.6,4.0,2.9,3.3,30,2,85
4,1.0,4.0,2.0,2.0,3.0,5.0,4.0,1.0,4.0,4.0,...,5.0,4.0,2.0,4.1,4.1,2.9,2.0,52,1,80


In [117]:
humor.dropna(inplace=True)

In [119]:
humor.shape

(980, 39)

In [120]:
humor.isna().sum().sum()

0

## Model Training

In [121]:
# Setting my X and y's

X = humor.drop(columns='gender')
y = humor.gender

In [122]:
ss = StandardScaler()
Xs = ss.fit_transform(X)

In [132]:
lr = LogisticRegression()

In [150]:
lr_score = cross_val_score(lr, Xs, y, cv=10)

print(lr_score)
print(np.mean(lr_score))

[0.57575758 0.65656566 0.56565657 0.58163265 0.53061224 0.60204082
 0.56122449 0.59793814 0.60824742 0.54639175]
0.5826067321649059


In [148]:
baseline = y.value_counts(normalize=True)[1]
baseline

0.5479591836734694

In [153]:
# Setting up a test train split with test size at 50%

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [155]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [156]:
yhat = lr.predict(X_test)
yhat_pp = lr.predict_proba(X_test)

In [157]:
print(yhat)
print(yhat_pp)

[2 1 1 2 1 2 1 1 2 2 1 1 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1
 1 2 1 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 1 2 1 1 1 2 1 2 1 1 2 2 2 2 2 2
 1 2 1 1 1 1 1 1 2 1 2 2 1 1 1 2 1 2 2 1 2 1 2 1 1 1 2 2 2 1 1 1 2 2 1 1 2
 2 2 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 2 1 1 1 2 1 2 1 2 2 1
 1 2 1 1 2 1 1 1 1 1 1 2 1 1 1 2 2 2 1 1 1 1 2 1 1 1 2 2 1 1 1 2 2 1 2 1 1
 1 1 1 1 2 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 2 1 1 1 2 1 2
 1 1 1 1 1 2 1 1 1 1 2 2 1 2 1 1 2 1 1 1 2 2 1 2 1 2 1 1 1 2 2 2 1 1 1 1 1
 1 1 1 1 1 1 1 2 1 1 1 2 1 1 2 2 1 2 1 2 1 2 2 2 2 1 1 2 1 2 2 2 1 1 1 1 1
 1 1 1 2 1 2 1 1 1 1 2 2 2 2 1 1 2 1 2 1 1 1 1 2 1 1 1 1 1 1 2 1 1 2 1 1 1
 2 2 1 1 2 2 2 1 1 1 2 2 1 2 2 1 1 1 1 2 1 1 1 1 2 1 2 1 1 2 1 2 2 1 1 1 2
 1 2 2 2 2 1 1 1 1 2 1 2 2 1 2 2 1 2 1 2 2 1 1 1 2 2 1 1 2 2 1 1 1 1 2 1 1
 1 1 2 1 1 1 1 2 1 1 1 1 2 2 1 2 1 2 1 1 2 1 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1
 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1 1 1 1 2 2 1 1 2 2 1 1 1
 1 1 2 2 2 2 2 2 2]
[[4.5

In [158]:
tp = np.sum((y_test == 1) & (yhat == 1))
fp = np.sum((y_test == 0) & (yhat == 1))
tn = np.sum((y_test == 0) & (yhat == 0))
fn = np.sum((y_test == 1) & (yhat == 0))
print(tp, fp, tn, fn)

194 0 0 0
