# Classification Project (First Glance)

Library

In [108]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import klib

   

from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


In [109]:
df = pd.read_csv('looks_at.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224503 entries, 0 to 224502
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   person ID           224503 non-null  object
 1   country             224503 non-null  object
 2   age                 224503 non-null  int64 
 3   gender              224503 non-null  object
 4   education           224503 non-null  object
 5   visual acuity       224503 non-null  object
 6   reading speed       224503 non-null  object
 7   text density        224503 non-null  object
 8   font size           224503 non-null  object
 9   paper type          224503 non-null  object
 10  initial focus time  224503 non-null  object
 11  looks at            224503 non-null  object
dtypes: int64(1), object(11)
memory usage: 20.6+ MB


In [110]:
df.head()

Unnamed: 0,person ID,country,age,gender,education,visual acuity,reading speed,text density,font size,paper type,initial focus time,looks at
0,p0,USA,23,male,undergraduate,average,normal,medium,medium,report,normal,top center
1,p1,USA,27,female,high school,good,normal,medium,medium,report,fast,center
2,p2,USA,48,female,graduate,poor,slow,medium,medium,academic paper,normal,under left
3,p3,USA,32,male,graduate,good,normal,medium,medium,report,fast,center
4,p4,USA,25,female,graduate,good,normal,medium,big,report,normal,top left


In [111]:
df = df.drop(columns=['person ID', 'country'], axis=1)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224503 entries, 0 to 224502
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   age                 224503 non-null  int64 
 1   gender              224503 non-null  object
 2   education           224503 non-null  object
 3   visual acuity       224503 non-null  object
 4   reading speed       224503 non-null  object
 5   text density        224503 non-null  object
 6   font size           224503 non-null  object
 7   paper type          224503 non-null  object
 8   initial focus time  224503 non-null  object
 9   looks at            224503 non-null  object
dtypes: int64(1), object(9)
memory usage: 17.1+ MB


In [112]:
target_column = 'looks at'

X = df.drop(target_column, axis=1)
y = df[target_column]

In [113]:
categorical_features = X.select_dtypes(exclude='number').columns
numerical_features = X.select_dtypes(include='number').columns


In [115]:
# Data Prep
numerical_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

In [116]:
numerical_transformer

In [117]:
categorical_transformer=Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
    
])

In [118]:
categorical_transformer

In [119]:
preprocessor=ColumnTransformer(
    transformers=[
        ('num',numerical_transformer,numerical_features),
        ('cat',categorical_transformer,categorical_features)
    ]
)

In [120]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestClassifier(n_estimators=100, random_state=42))])

In [98]:
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', LogisticRegression(random_state=42))])

In [121]:
pipeline

In [130]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [131]:
x_train, x_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)


In [132]:
pipeline.fit(x_train, y_train)


In [133]:
y_pred = pipeline.predict(x_test)

In [137]:
y_pred

array([2, 1, 2, ..., 0, 2, 0], shape=(44901,))

In [140]:
class_report = classification_report(y_test, y_pred)

print(class_report)

              precision    recall  f1-score   support

           0       0.67      0.70      0.69     13370
           1       0.46      0.46      0.46     14364
           2       0.60      0.61      0.60     14889
           3       0.27      0.20      0.23       906
           4       0.20      0.10      0.14      1372

    accuracy                           0.56     44901
   macro avg       0.44      0.41      0.42     44901
weighted avg       0.56      0.56      0.56     44901



In [141]:
cm = confusion_matrix(y_test, y_pred)