In [72]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

PATH = r"C:\Users\Wiki How\Downloads\acm_meeting_data.csv"

In [73]:
df = pd.read_csv(PATH)

In [74]:
df.head()

Unnamed: 0,Timestamp,student_id,first_sign_in,class_level,first_name,pref_name,last_name,discord_username,gender,ethnicity,yos,major,is_real_submission
0,7/12/2024 13:03,11403822,Yes,CSCI 411+,Ryan,,Makela,flyin.ryan,Male,White,Junior,Computer Science,1
1,7/12/2024 13:03,12119212,Yes,CSCI 411+,Victoria,,Worthington,_vica,Female,White,Senior,Computer Science,1
2,9/6/2024 15:48,11794901,Yes,CSCI 411+,Kamran,Koris,Haq,Koris,Nonbinary,Awa,Senior,Computer Science,1
3,9/6/2024 15:50,11479859,Yes,CSCI 211/equivalent,Adrian,,Moreno Talavera,jereezzy22,Male,Hispanic/Latino,Junior,Computer Engineering,1
4,9/6/2024 15:50,12614278,Yes,Just starting out/curious,Montana,Montana,Pawek,FlamingWombat,Male,Caucasian,Graduate,Computer Science,1


In [75]:
first_sign_in = df['first_sign_in'] == "Yes"
df = df[first_sign_in]

In [76]:
df = df.rename(columns={'Timestamp': 'timestamp'})

In [77]:
df.head()

Unnamed: 0,timestamp,student_id,first_sign_in,class_level,first_name,pref_name,last_name,discord_username,gender,ethnicity,yos,major,is_real_submission
0,7/12/2024 13:03,11403822,Yes,CSCI 411+,Ryan,,Makela,flyin.ryan,Male,White,Junior,Computer Science,1
1,7/12/2024 13:03,12119212,Yes,CSCI 411+,Victoria,,Worthington,_vica,Female,White,Senior,Computer Science,1
2,9/6/2024 15:48,11794901,Yes,CSCI 411+,Kamran,Koris,Haq,Koris,Nonbinary,Awa,Senior,Computer Science,1
3,9/6/2024 15:50,11479859,Yes,CSCI 211/equivalent,Adrian,,Moreno Talavera,jereezzy22,Male,Hispanic/Latino,Junior,Computer Engineering,1
4,9/6/2024 15:50,12614278,Yes,Just starting out/curious,Montana,Montana,Pawek,FlamingWombat,Male,Caucasian,Graduate,Computer Science,1


In [78]:
df['class_level'].unique()

array(['CSCI 411+', 'CSCI 211/equivalent', 'Just starting out/curious',
       'CSCI 111/equivalent', 'CSCI 311/equivalent'], dtype=object)

In [79]:
# Encode the data
df['timestamp'] = df['timestamp'].astype('datetime64[ms]')
df['day_of_week'] = df['timestamp'].dt.day_of_week

mapping = {'CSCI 411+' : 4, 
           'CSCI 211/equivalent' : 2, 
           'Just starting out/curious' : 0,
            'CSCI 111/equivalent' : 1,     
            'CSCI 311/equivalent' : 3}
df['class_level'] = df['class_level'].map(mapping)
df['pref_name'] = df['pref_name'].fillna(df['first_name'])

In [80]:
# Encoding text
def text_features(df, columns):
    for col in columns:
        df[f'{col}_is_lowercase'] = df[col].apply(lambda x: x.islower())
        df[f'{col}_lowercase_ratio'] = df[col].apply(lambda x: sum(1 for c in x if c.islower()) / len(x))
        df[f'{col}_uppercase_ratio'] = df[col].apply(lambda x: sum(1 for c in x if c.isupper()) / len(x))
        df[f'{col}_first_is_upper'] = df[col].apply(lambda x: x[0].isupper() if x else False)
    return df

# Apply this function to name columns
df = text_features(df, ['first_name', 'pref_name', 'last_name'])

In [81]:
df = df.drop(columns=['timestamp', 'first_sign_in', 'discord_username', 'first_name', 'pref_name', 'last_name'])

In [82]:
df.tail(10)

Unnamed: 0,student_id,class_level,gender,ethnicity,yos,major,is_real_submission,day_of_week,first_name_is_lowercase,first_name_lowercase_ratio,first_name_uppercase_ratio,first_name_first_is_upper,pref_name_is_lowercase,pref_name_lowercase_ratio,pref_name_uppercase_ratio,pref_name_first_is_upper,last_name_is_lowercase,last_name_lowercase_ratio,last_name_uppercase_ratio,last_name_first_is_upper
4672,25895966,3,Prefer not to answer,Nubians,Sophomore,Prefer not to answer,0,4,True,1.0,0.0,False,True,1.0,0.0,False,True,1.0,0.0,False
4673,26378931,2,Female,Haliwa-Saponi,Prefer not to answer,Computer Information Systems,0,4,True,1.0,0.0,False,True,1.0,0.0,False,True,1.0,0.0,False
4674,12151283,3,Female,Asian,Sophomore,Computer Science,1,0,False,0.6,0.2,True,False,0.555556,0.222222,True,False,0.833333,0.166667,True
4679,12341122,2,Male,Indian,Freshman,Computer Science,1,4,False,0.875,0.125,True,False,0.8,0.2,True,False,0.833333,0.166667,True
4680,12283116,0,,,,,1,4,False,0.8,0.2,True,False,0.8,0.2,True,False,0.8,0.2,True
4681,12473852,2,Male,Middle Eastern,Freshman,Computer Science,1,4,False,0.666667,0.333333,True,False,0.666667,0.333333,True,False,0.75,0.25,True
4682,11234835,0,Male,Wasian,Freshman,Prefer not to answer,1,4,False,0.75,0.25,True,False,0.75,0.25,True,False,0.875,0.125,True
4683,123456789,0,Nonbinary,white,Senior,psychology,1,4,True,1.0,0.0,False,True,1.0,0.0,False,True,1.0,0.0,False
4684,11794901,4,,,,,1,4,False,0.833333,0.166667,True,False,0.8,0.2,True,False,0.666667,0.333333,True
4685,10174516,4,,,,,1,4,False,0.769231,0.153846,True,False,0.75,0.25,True,False,0.875,0.125,True


In [83]:
df['is_real_submission'].value_counts()

is_real_submission
0    2331
1      51
Name: count, dtype: int64

In [84]:
# Remove NaN rows
df = df.dropna()

In [85]:
df['is_real_submission'].value_counts()

is_real_submission
0    2331
1      46
Name: count, dtype: int64

In [86]:
df.dtypes

student_id                      int64
class_level                     int64
gender                         object
ethnicity                      object
yos                            object
major                          object
is_real_submission              int64
day_of_week                     int32
first_name_is_lowercase          bool
first_name_lowercase_ratio    float64
first_name_uppercase_ratio    float64
first_name_first_is_upper        bool
pref_name_is_lowercase           bool
pref_name_lowercase_ratio     float64
pref_name_uppercase_ratio     float64
pref_name_first_is_upper         bool
last_name_is_lowercase           bool
last_name_lowercase_ratio     float64
last_name_uppercase_ratio     float64
last_name_first_is_upper         bool
dtype: object

In [87]:
# Encoding everything
from sklearn.preprocessing import LabelEncoder
import numpy as np

le = LabelEncoder()

for col in df.columns:
    if df[col].dtype != np.int64 and df[col].dtype != np.int32:
        le.fit(df[col])
        df[col] = le.transform(df[col])
        
df.head(5)
        

Unnamed: 0,student_id,class_level,gender,ethnicity,yos,major,is_real_submission,day_of_week,first_name_is_lowercase,first_name_lowercase_ratio,first_name_uppercase_ratio,first_name_first_is_upper,pref_name_is_lowercase,pref_name_lowercase_ratio,pref_name_uppercase_ratio,pref_name_first_is_upper,last_name_is_lowercase,last_name_lowercase_ratio,last_name_uppercase_ratio,last_name_first_is_upper
0,11403822,4,24,1020,2,2,1,4,0,3,7,1,0,4,8,1,0,6,8,1
1,12119212,4,15,1020,4,2,1,4,0,8,3,1,0,9,3,1,0,11,2,1
2,11794901,4,29,71,4,2,1,4,0,6,5,1,0,6,6,1,0,1,11,1
3,11479859,2,24,354,2,0,1,4,0,6,5,1,0,7,5,1,0,4,6,1
4,12614278,0,24,161,1,2,1,4,0,7,4,1,0,8,4,1,0,4,9,1


In [91]:
from sklearn import preprocessing
x = df.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
normalized_df = pd.DataFrame(x_scaled, columns=df.columns)

In [93]:
normalized_df.tail(20)

Unnamed: 0,student_id,class_level,gender,ethnicity,yos,major,is_real_submission,day_of_week,first_name_is_lowercase,first_name_lowercase_ratio,first_name_uppercase_ratio,first_name_first_is_upper,pref_name_is_lowercase,pref_name_lowercase_ratio,pref_name_uppercase_ratio,pref_name_first_is_upper,last_name_is_lowercase,last_name_lowercase_ratio,last_name_uppercase_ratio,last_name_first_is_upper
2357,0.316633,0.0,0.483333,0.582326,0.4,0.8,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2358,0.115897,0.5,0.25,0.539535,0.6,0.8,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2359,0.807054,0.25,0.4,0.303256,0.0,0.8,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2360,0.275442,0.0,0.4,0.648372,0.0,0.4,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2361,0.37123,0.0,0.483333,0.264186,0.4,0.2,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2362,0.068108,0.75,0.25,0.611163,0.4,0.2,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2363,0.342813,1.0,0.25,0.907907,0.6,0.4,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2364,0.645412,1.0,0.4,0.617674,1.0,0.2,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2365,0.123204,1.0,0.4,0.184186,0.2,0.2,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2366,0.308549,0.5,0.4,0.609302,0.4,0.4,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


Data should now be cleaned.

Now it's just a matter of selecting good samples for the machine learning algorithm. 

In [94]:
df['is_real_submission'].value_counts() 

is_real_submission
0    2331
1      46
Name: count, dtype: int64

In [98]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

y = df['is_real_submission']
X = df.drop(columns=['is_real_submission'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

iso_forest = IsolationForest(contamination=0.02, random_state=42)
iso_forest.fit(X_train, y_train)

In [99]:
iso_forest.score()

AttributeError: 'IsolationForest' object has no attribute 'score'