# Classification

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [2]:
import warnings
from sklearn.exceptions import ConvergenceWarning, UndefinedMetricWarning

# Silence these warning categories for the rest of the session.
# Note that this also hides pandas/scikit-learn deprecation notices and
# solver-convergence / undefined-metric diagnostics in the cells below.
for _ignored_category in (
    FutureWarning,
    DeprecationWarning,
    ConvergenceWarning,
    UndefinedMetricWarning,
):
    warnings.filterwarnings('ignore', category=_ignored_category)


In [3]:
# Load the dataset from 'psyco.csv' (path is relative to the working directory).
df = pd.read_csv('psyco.csv')

In [4]:
# Number of missing entries per column in the freshly loaded frame
null_mask = df.isna()
missing_counts = null_mask.sum()

# Show the per-column totals
print(missing_counts)


age                 0
gender              0
occupation          0
line_of_work      696
time_bp             0
time_dp             0
travel_time         0
easeof_online       0
home_env            0
prod_inc            0
sleep_bal           0
new_skill           0
fam_connect         0
relaxed             0
self_time           0
like_hw             0
dislike_hw          0
prefer              0
certaindays_hw      0
dtype: int64


In [5]:
# Preview the first rows of the raw frame (categorical answers are still strings here).
df.head()

Unnamed: 0,age,gender,occupation,line_of_work,time_bp,time_dp,travel_time,easeof_online,home_env,prod_inc,sleep_bal,new_skill,fam_connect,relaxed,self_time,like_hw,dislike_hw,prefer,certaindays_hw
0,19-25,Male,Student in College,,7,5,0.5,3,3,0.0,0.0,0.5,1.0,-0.5,-0.5,100,1,Complete Physical Attendance,Yes
1,Dec-18,Male,Student in School,,7,11,0.5,4,2,-0.5,0.5,-1.0,1.0,1.0,1.0,1111,1110,Complete Physical Attendance,No
2,19-25,Male,Student in College,,7,7,1.5,2,2,1.0,0.0,0.5,0.5,0.5,0.5,1100,111,Complete Physical Attendance,Yes
3,19-25,Male,Student in College,,7,7,1.5,3,1,0.0,1.0,0.5,0.0,-1.0,-0.5,100,1111,Complete Physical Attendance,Yes
4,19-25,Female,Student in College,,7,7,1.5,2,2,0.0,0.0,0.0,0.0,0.5,0.0,1010,1000,Complete Physical Attendance,Yes


In [6]:
# Replace the categorical (string) answers with integer codes.
#
# Each mapping below lists the raw value found in the CSV and the code it is
# replaced with. Values that are not listed in a mapping are left untouched.

# NOTE(review): 'Dec-18' is the literal value stored in the CSV (see df.head());
# it looks like a spreadsheet-mangled "12-18" age bracket - confirm against the raw data.
age_grp = {
    'Dec-18': 0,
    '19-25': 1,
    '26-32': 2,
    '33-40': 3,
    '40-50': 4,
    '50-60': 5,
    '60+': 6,
}

gender_grp = {
    'Male': 0,
    'Female': 1,
    'Prefer not to say': 2,
}

occupation_grp = {
    'Working Professional': 0,
    'Student in College': 1,
    'Entrepreneur': 2,
    'Homemaker': 3,
    'Medical Professional aiding efforts against COVID-19': 4,
    'Currently Out of Work': 5,
    'Student in School': 6,
    'Retired/Senior Citizen': 7,
}

# The trailing space in 'APSPDCL ' is part of the key and must be kept.
line_of_work_grp = {
    'Teaching': 0,
    'Engineering': 1,
    'Management': 2,
    'Other': 3,
    'Government Employee': 4,
    'Architect': 5,
    'APSPDCL ': 6,
    'Architecture': 7,
}

prefer_grp = {
    'Complete Physical Attendance': 0,
    'Work/study from home': 1,
}

certaindays_hw_grp = {
    'Yes': 0,
    'No': 1,
    'Maybe': 2,
}

# (source column, helper column, value -> code mapping)
column_encodings = [
    ('age', 'Age Group', age_grp),
    ('gender', 'Gender Group', gender_grp),
    ('occupation', 'Occupation Group', occupation_grp),
    ('line_of_work', 'Line of Work Group', line_of_work_grp),
    ('prefer', 'Prefer Group', prefer_grp),
    ('certaindays_hw', 'Certain days Group', certaindays_hw_grp),
]

for source_col, helper_col, codes in column_encodings:
    # Assign the encoded Series back explicitly. The previous form,
    # `df[new] = df[col].replace(codes, inplace=True)`, stored None in the new
    # column and only encoded `col` through chained in-place mutation of a
    # column selection, which does not update `df` under pandas Copy-on-Write.
    # infer_objects() restores a numeric dtype when replace() leaves the
    # result as an object column.
    df[source_col] = df[source_col].replace(codes).infer_objects()

    # The column-drop cell that follows removes the "... Group" columns by
    # name, so they are still created here to keep that cell working.
    df[helper_col] = df[source_col]


In [7]:
# Drop the helper "... Group" columns together with line_of_work, like_hw and dislike_hw
columns_to_drop = [
    'Age Group',
    'Gender Group',
    'Occupation Group',
    'Line of Work Group',
    'Prefer Group',
    'Certain days Group',
    'line_of_work',
    "like_hw",
    "dislike_hw",
]
df = df.drop(columns=columns_to_drop)


In [8]:
# Preview the frame after the string answers were mapped to integer codes and the unused columns were dropped.
df.head()

Unnamed: 0,age,gender,occupation,time_bp,time_dp,travel_time,easeof_online,home_env,prod_inc,sleep_bal,new_skill,fam_connect,relaxed,self_time,prefer,certaindays_hw
0,1,0,1,7,5,0.5,3,3,0.0,0.0,0.5,1.0,-0.5,-0.5,0,0
1,0,0,6,7,11,0.5,4,2,-0.5,0.5,-1.0,1.0,1.0,1.0,0,1
2,1,0,1,7,7,1.5,2,2,1.0,0.0,0.5,0.5,0.5,0.5,0,0
3,1,0,1,7,7,1.5,3,1,0.0,1.0,0.5,0.0,-1.0,-0.5,0,0
4,1,1,1,7,7,1.5,2,2,0.0,0.0,0.0,0.0,0.5,0.0,0,0


In [9]:
# Score columns that get re-coded as consecutive integer labels
categorical_columns = ["prod_inc", "sleep_bal", "new_skill", "fam_connect", "relaxed", "self_time"]

# One encoder instance is refit for every column in turn, so after the loop
# LE only remembers the classes of the last column it was fitted on.
LE = LabelEncoder()

# Encode column by column, writing each result back into the frame
for column_name in categorical_columns:
    df[column_name] = LE.fit_transform(df[column_name])

In [10]:
# Re-check the number of missing entries per column after encoding
null_mask = df.isna()
missing_counts = null_mask.sum()

# Show the per-column totals
print(missing_counts)


age               0
gender            0
occupation        0
time_bp           0
time_dp           0
travel_time       0
easeof_online     0
home_env          0
prod_inc          0
sleep_bal         0
new_skill         0
fam_connect       0
relaxed           0
self_time         0
prefer            0
certaindays_hw    0
dtype: int64


In [11]:
# Preview the frame after label-encoding the score columns.
df.head()

Unnamed: 0,age,gender,occupation,time_bp,time_dp,travel_time,easeof_online,home_env,prod_inc,sleep_bal,new_skill,fam_connect,relaxed,self_time,prefer,certaindays_hw
0,1,0,1,7,5,0.5,3,3,2,2,3,4,1,1,0,0
1,0,0,6,7,11,0.5,4,2,1,3,0,4,4,4,0,1
2,1,0,1,7,7,1.5,2,2,4,2,3,3,3,3,0,0
3,1,0,1,7,7,1.5,3,1,2,4,3,2,0,1,0,0
4,1,1,1,7,7,1.5,2,2,2,2,2,2,3,2,0,0


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_absolute_error, mean_squared_error, roc_auc_score, roc_curve
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


In [13]:
# Split the dataset into training and testing sets.
# The encoded "gender" column is the target; every other column is a feature.
X = df.drop("gender", axis=1)
y = df["gender"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Class names used as `target_names` in the classification reports.
# classification_report pairs target_names with the labels in sorted order,
# so the names are sorted here; y.unique() alone returns them in order of
# first appearance, which could attach names to the wrong rows.
class_labels = np.sort(y.unique()).astype(str)

In [17]:
# Logistic Regression: fit on the scaled training split, evaluate on the test split
logistic_model = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
proba_logistic = logistic_model.predict_proba(X_test)

# Confusion matrix (rows: true class, columns: predicted class)
print("Logistic Regression")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logistic), '\n')

# These error metrics are computed on the integer class codes themselves
mse_logistic = mean_squared_error(y_test, y_pred_logistic)
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_logistic))
print('Mean Squared Error:', mse_logistic)
print('Root Mean Squared Error:', np.sqrt(mse_logistic), '\n')

# Per-class precision/recall/F1, overall accuracy and one-vs-rest AUC
print("Classification Report:")
report_logistic = classification_report(
    y_test.astype(str),
    y_pred_logistic.astype(str),
    target_names=class_labels,
    output_dict=False,
)
print(report_logistic)
print("Accuracy: ",accuracy_score(y_test, y_pred_logistic))
print("AUC:", roc_auc_score(y_test, proba_logistic, multi_class='ovr'))

Logistic Regression
Confusion Matrix:
[[180  19   0]
 [ 50 101   0]
 [  2   1   0]] 

Mean Absolute Error: 0.2096317280453258
Mean Squared Error: 0.22096317280453256
Root Mean Squared Error: 0.4700672003070758 

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.90      0.84       199
           1       0.83      0.67      0.74       151
           2       0.00      0.00      0.00         3

    accuracy                           0.80       353
   macro avg       0.54      0.52      0.53       353
weighted avg       0.79      0.80      0.79       353

Accuracy:  0.7960339943342776
AUC: 0.8598180399376135


In [15]:
# Decision Tree: fit on the scaled training split, evaluate on the test split.
# random_state is fixed (matching the other seeded models in this notebook)
# so that re-running the cell reproduces the same tree and the same metrics.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Confusion matrix (rows: true class, columns: predicted class)
print("Decision Tree")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt), '\n')

# These error metrics are computed on the integer class codes themselves
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_dt))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred_dt))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_dt)), '\n')

# Per-class precision/recall/F1, overall accuracy and one-vs-rest AUC
print("Classification Report:")
print(classification_report(y_test.astype(str),y_pred_dt.astype(str), target_names=class_labels, output_dict=False))
print("Accuracy: ",accuracy_score(y_test, y_pred_dt))
print("AUC:", roc_auc_score(y_test, dt_model.predict_proba(X_test), multi_class='ovr'))

Decision Tree
Confusion Matrix:
[[178  20   1]
 [ 23 128   0]
 [  2   1   0]] 

Mean Absolute Error: 0.141643059490085
Mean Squared Error: 0.15864022662889518
Root Mean Squared Error: 0.39829665656253654 

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.89       199
           1       0.86      0.85      0.85       151
           2       0.00      0.00      0.00         3

    accuracy                           0.87       353
   macro avg       0.58      0.58      0.58       353
weighted avg       0.86      0.87      0.86       353

Accuracy:  0.8668555240793201
AUC: 0.7454998799633223


In [16]:
# Random Forest: 100 seeded trees fitted on the scaled training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Hard predictions and class probabilities for the test split
y_pred_rf = rf_model.predict(X_test)
proba_rf = rf_model.predict_proba(X_test)

# Confusion matrix (rows: true class, columns: predicted class)
print("Random Forest")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf), '\n')

# These error metrics are computed on the integer class codes themselves
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print('Mean Absolute Error:', mae_rf)
print('Mean Squared Error:', mse_rf)
print('Root Mean Squared Error:', np.sqrt(mse_rf), '\n')

# Per-class precision/recall/F1, overall accuracy and one-vs-rest AUC
print("Classification Report:")
print(
    classification_report(
        y_test.astype(str),
        y_pred_rf.astype(str),
        target_names=class_labels,
        output_dict=False,
    )
)
print("Accuracy: ",accuracy_score(y_test, y_pred_rf))
print("AUC:", roc_auc_score(y_test, proba_rf, multi_class='ovr'))

Random Forest
Confusion Matrix:
[[198   1   0]
 [ 32 119   0]
 [  3   0   0]] 

Mean Absolute Error: 0.11048158640226628
Mean Squared Error: 0.1274787535410765
Root Mean Squared Error: 0.35704166919433433 

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       199
           1       0.99      0.79      0.88       151
           2       0.00      0.00      0.00         3

    accuracy                           0.90       353
   macro avg       0.61      0.59      0.60       353
weighted avg       0.90      0.90      0.89       353

Accuracy:  0.8980169971671388
AUC: 0.892950943492442


In [14]:
# K-Nearest Neighbors: 5 neighbours, Minkowski distance with p=2
KNN = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
KNN.fit(X_train, y_train)

# Hard predictions and class probabilities for the test split
y_pred = KNN.predict(X_test)
proba_knn = KNN.predict_proba(X_test)

# Confusion matrix (rows: true class, columns: predicted class)
print("K-Nearest Neighbors")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred), '\n')

# Same metric functions as sklearn.metrics.*, called through the names
# imported directly at the top of the modelling section.
mse_knn = mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse_knn)
print('Root Mean Squared Error:', np.sqrt(mse_knn), '\n')

# Per-class precision/recall/F1, overall accuracy and one-vs-rest AUC
print("Classification Report:")
print(classification_report(y_test.astype(str), y_pred.astype(str), target_names=class_labels, output_dict=False))
print("Accuracy:",accuracy_score(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, proba_knn, multi_class='ovr'))

K-Nearest Neighbors
Confusion Matrix:
[[179  20   0]
 [ 30 121   0]
 [  2   1   0]] 

Mean Absolute Error: 0.1558073654390935
Mean Squared Error: 0.1671388101983003
Root Mean Squared Error: 0.4088261368825387 

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.90      0.87       199
           1       0.85      0.80      0.83       151
           2       0.00      0.00      0.00         3

    accuracy                           0.85       353
   macro avg       0.57      0.57      0.57       353
weighted avg       0.84      0.85      0.85       353

Accuracy: 0.8498583569405099
AUC: 0.7686418762488771


In [20]:
# One unfitted estimator per algorithm, keyed by the name printed in the report.
models = {
    "KNeighbors Classifier": KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # Seeded: cross_val_score and cross_val_predict below each refit the model,
    # so an unseeded tree could give an accuracy and a report that describe
    # different trees, and the numbers would change on every run.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42)
}

# NOTE(review): the 2-fold cross-validation runs on the held-out test split
# (X_test / y_test), not on the training split - confirm this is intended.
for model_name, model in models.items():
    # Per-fold accuracy
    scores = cross_val_score(model, X_test, y_test, cv=2, scoring='accuracy')

    # Out-of-fold predictions: each sample is predicted by the model that was
    # trained on the other fold.
    y_pred = cross_val_predict(model, X_test, y_test, cv=2, method='predict')

    print(f"{model_name} Cross Validation:")
    print(classification_report(y_test, y_pred, target_names=class_labels))
    # Mean accuracy across folds, with a +/- band of two standard deviations
    print(f"Cross-validated Accuracy: {scores.mean():.2f} (+/- {scores.std() * 2:.2f}) \n")

KNeighbors Classifier Cross Validation:
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       199
           1       0.77      0.77      0.77       151
           2       0.00      0.00      0.00         3

    accuracy                           0.80       353
   macro avg       0.53      0.54      0.53       353
weighted avg       0.79      0.80      0.80       353

Cross-validated Accuracy: 0.80 (+/- 0.02) 

Random Forest Cross Validation:
              precision    recall  f1-score   support

           0       0.87      0.92      0.89       199
           1       0.88      0.83      0.86       151
           2       0.00      0.00      0.00         3

    accuracy                           0.88       353
   macro avg       0.58      0.58      0.58       353
weighted avg       0.87      0.88      0.87       353

Cross-validated Accuracy: 0.88 (+/- 0.03) 

Decision Tree Cross Validation:
              precision    recall  f1-score   