In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Importing the Data

In [365]:
df = pd.read_csv(r"D:\Study\BA\TEST\Students Adaptability Level in Online Education\students_adaptability_level_online_education.csv")

### Basic Check on data

In [366]:
df.head()

Unnamed: 0,Gender,Age,Education Level,Institution Type,IT Student,Location,Load-shedding,Financial Condition,Internet Type,Network Type,Class Duration,Self Lms,Device,Adaptivity Level
0,Boy,21-25,University,Non Government,No,Yes,Low,Mid,Wifi,4G,3-6,No,Tab,Moderate
1,Girl,21-25,University,Non Government,No,Yes,High,Mid,Mobile Data,4G,1-3,Yes,Mobile,Moderate
2,Girl,16-20,College,Government,No,Yes,Low,Mid,Wifi,4G,1-3,No,Mobile,Moderate
3,Girl,11-15,School,Non Government,No,Yes,Low,Mid,Mobile Data,4G,1-3,No,Mobile,Moderate
4,Girl,16-20,School,Non Government,No,Yes,Low,Poor,Mobile Data,3G,0,No,Mobile,Low


In [367]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Gender               1205 non-null   object
 1   Age                  1205 non-null   object
 2   Education Level      1205 non-null   object
 3   Institution Type     1205 non-null   object
 4   IT Student           1205 non-null   object
 5   Location             1205 non-null   object
 6   Load-shedding        1205 non-null   object
 7   Financial Condition  1205 non-null   object
 8   Internet Type        1205 non-null   object
 9   Network Type         1205 non-null   object
 10  Class Duration       1205 non-null   object
 11  Self Lms             1205 non-null   object
 12  Device               1205 non-null   object
 13  Adaptivity Level     1205 non-null   object
dtypes: object(14)
memory usage: 131.9+ KB


In [368]:
df.isna().sum()

Gender                 0
Age                    0
Education Level        0
Institution Type       0
IT Student             0
Location               0
Load-shedding          0
Financial Condition    0
Internet Type          0
Network Type           0
Class Duration         0
Self Lms               0
Device                 0
Adaptivity Level       0
dtype: int64

In [369]:
df.nunique()

Gender                 2
Age                    6
Education Level        3
Institution Type       2
IT Student             2
Location               2
Load-shedding          2
Financial Condition    3
Internet Type          2
Network Type           3
Class Duration         3
Self Lms               2
Device                 3
Adaptivity Level       3
dtype: int64

### Checking P-Value

In [370]:
from scipy.stats import chi2_contingency
for col in df.columns:
    if col not in ['weight','Adaptivity Level']:
        table = pd.crosstab(index=df[col], columns=df['Adaptivity Level'], margins=True)
        result = chi2_contingency(table)
        print(col,'p-value =',result[1].round(4))

Gender p-value = 0.0364
Age p-value = 0.0
Education Level p-value = 0.0
Institution Type p-value = 0.0
IT Student p-value = 0.0033
Location p-value = 0.0
Load-shedding p-value = 0.1258
Financial Condition p-value = 0.0
Internet Type p-value = 0.0018
Network Type p-value = 0.0004
Class Duration p-value = 0.0
Self Lms p-value = 0.0
Device p-value = 0.0


##### Load-shedding having a p-Value above 0.05

### Checking the Unique Values within the columns

In [371]:
df["Gender"].value_counts()

Boy     663
Girl    542
Name: Gender, dtype: int64

In [372]:
df["Age"].value_counts()

21-25    374
11-15    353
16-20    278
1-5       81
26-30     68
6-10      51
Name: Age, dtype: int64

In [373]:
df["Education Level"].value_counts()

School        530
University    456
College       219
Name: Education Level, dtype: int64

In [374]:
df["Institution Type"].value_counts()

Non Government    823
Government        382
Name: Institution Type, dtype: int64

In [375]:
df["IT Student"].value_counts()

No     901
Yes    304
Name: IT Student, dtype: int64

In [376]:
df["Location"].value_counts()

Yes    935
No     270
Name: Location, dtype: int64

In [377]:
df["Load-shedding"].value_counts()

Low     1004
High     201
Name: Load-shedding, dtype: int64

In [378]:
df["Financial Condition"].value_counts()

Mid     878
Poor    242
Rich     85
Name: Financial Condition, dtype: int64

In [379]:
df["Internet Type"].value_counts()

Mobile Data    695
Wifi           510
Name: Internet Type, dtype: int64

In [380]:
df["Network Type"].value_counts()

4G    775
3G    411
2G     19
Name: Network Type, dtype: int64

In [381]:
df["Class Duration"].value_counts()

1-3    840
3-6    211
0      154
Name: Class Duration, dtype: int64

In [382]:
df["Device"].value_counts()

Mobile      1013
Computer     162
Tab           30
Name: Device, dtype: int64

### Target Variable

In [383]:
df["Adaptivity Level"].value_counts()

Moderate    625
Low         480
High        100
Name: Adaptivity Level, dtype: int64

### Converting the Values where unique is less than 2 to dummy values (0 or 1)

In [384]:
df.nunique()<3

Gender                  True
Age                    False
Education Level        False
Institution Type        True
IT Student              True
Location                True
Load-shedding           True
Financial Condition    False
Internet Type           True
Network Type           False
Class Duration         False
Self Lms                True
Device                 False
Adaptivity Level       False
dtype: bool

In [385]:
df = pd.get_dummies(df,columns=["Gender","Institution Type","IT Student","Location","Load-shedding","Internet Type","Self Lms"],drop_first=True)

In [386]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Age                              1205 non-null   object
 1   Education Level                  1205 non-null   object
 2   Financial Condition              1205 non-null   object
 3   Network Type                     1205 non-null   object
 4   Class Duration                   1205 non-null   object
 5   Device                           1205 non-null   object
 6   Adaptivity Level                 1205 non-null   object
 7   Gender_Girl                      1205 non-null   uint8 
 8   Institution Type_Non Government  1205 non-null   uint8 
 9   IT Student_Yes                   1205 non-null   uint8 
 10  Location_Yes                     1205 non-null   uint8 
 11  Load-shedding_Low                1205 non-null   uint8 
 12  Internet Type_Wifi               1

### Using LabelEncoder where the value is >3

In [387]:
df.select_dtypes(include="object").columns

Index(['Age', 'Education Level', 'Financial Condition', 'Network Type',
       'Class Duration', 'Device', 'Adaptivity Level'],
      dtype='object')

In [388]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Age'] = le.fit_transform(df['Age'])
df['Education Level'] = le.fit_transform(df['Education Level'])
df['Financial Condition'] = le.fit_transform(df['Financial Condition'])
df['Network Type'] = le.fit_transform(df['Network Type'])
df['Class Duration'] = le.fit_transform(df['Class Duration'])
df['Device'] = le.fit_transform(df['Device'])
df['Adaptivity Level'] = le.fit_transform(df['Adaptivity Level'])

In [390]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1205 entries, 0 to 1204
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype
---  ------                           --------------  -----
 0   Age                              1205 non-null   int32
 1   Education Level                  1205 non-null   int32
 2   Financial Condition              1205 non-null   int32
 3   Network Type                     1205 non-null   int32
 4   Class Duration                   1205 non-null   int32
 5   Device                           1205 non-null   int32
 6   Adaptivity Level                 1205 non-null   int32
 7   Gender_Girl                      1205 non-null   uint8
 8   Institution Type_Non Government  1205 non-null   uint8
 9   IT Student_Yes                   1205 non-null   uint8
 10  Location_Yes                     1205 non-null   uint8
 11  Load-shedding_Low                1205 non-null   uint8
 12  Internet Type_Wifi               1205 non-null  

In [391]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [392]:
y = df['Adaptivity Level']
x = df.drop(['Adaptivity Level','Load-shedding_Low'],axis=1)

In [393]:
x.shape,y.shape

((1205, 12), (1205,))

In [394]:
y.value_counts()

2    625
1    480
0    100
Name: Adaptivity Level, dtype: int64

In [395]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

In [396]:
Rtree = RandomForestClassifier(n_estimators=1000)

In [397]:
Rtree.fit(x_train,y_train)

RandomForestClassifier(n_estimators=1000)

In [398]:
prediction = Rtree.predict(x_test)

In [399]:
cm = confusion_matrix(y_test, prediction)

In [400]:
cm

array([[ 21,   0,   2],
       [  1,  86,   5],
       [  0,   8, 118]], dtype=int64)

In [402]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.95      0.91      0.93        23
           1       0.91      0.93      0.92        92
           2       0.94      0.94      0.94       126

    accuracy                           0.93       241
   macro avg       0.94      0.93      0.93       241
weighted avg       0.93      0.93      0.93       241

