In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


In [3]:
# Location of the raw event-attendance CSV. Held in one named constant so the
# notebook can be pointed at a different copy of the file without touching the
# load call. NOTE(review): "/content/" is an absolute, environment-specific
# path (presumably a Colab session) -- adjust when running elsewhere.
DATA_PATH = "/content/event_attendance_data_1000.csv"

# Load the full dataset into memory; later cells only read from `df`.
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,age,gender,registration_time_days_before,location_distance_km,event_type,past_attendance_count,reminder_sent,ticket_price,is_weekend_event,attended
0,56,Male,16,3.55,Tech Talk,8,1,73,0,1
1,46,Male,11,39.53,Seminar,5,0,536,1,1
2,32,Female,31,10.87,Seminar,8,0,100,0,1
3,25,Female,14,13.67,Workshop,1,1,175,0,0
4,38,Male,15,9.07,Seminar,2,0,426,1,0


In [5]:
# Preview the last five rows of the raw frame.
df.tail(n=5)

Unnamed: 0,age,gender,registration_time_days_before,location_distance_km,event_type,past_attendance_count,reminder_sent,ticket_price,is_weekend_event,attended
995,41,Female,23,47.62,Tech Talk,6,1,492,0,0
996,32,Male,39,15.57,Seminar,5,0,922,0,0
997,46,Female,37,5.51,Workshop,1,1,127,0,0
998,25,Male,9,30.35,Cultural,8,0,1449,0,1
999,22,Female,34,31.56,Seminar,5,0,564,0,1


In [7]:
# Show five randomly chosen rows. The seed is fixed (same value the split and
# the model use) so that Restart & Run All displays the same rows every time
# instead of a different random draw on each execution.
df.sample(n=5, random_state=42)

Unnamed: 0,age,gender,registration_time_days_before,location_distance_km,event_type,past_attendance_count,reminder_sent,ticket_price,is_weekend_event,attended
105,40,Female,39,16.93,Cultural,7,0,1157,0,1
898,27,Male,4,17.44,Workshop,6,0,769,0,0
989,27,Male,9,15.09,Workshop,6,0,180,0,1
859,48,Male,30,9.9,Seminar,3,1,687,1,0
590,45,Female,36,38.61,Seminar,6,1,150,1,1


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,registration_time_days_before,location_distance_km,past_attendance_count,reminder_sent,ticket_price,is_weekend_event,attended
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,38.745,19.406,25.56482,4.604,0.484,755.686,0.496,0.631
std,12.186734,11.469905,14.195356,2.840995,0.499994,427.291267,0.500234,0.482775
min,18.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,28.0,9.0,14.2475,2.0,0.0,394.75,0.0,0.0
50%,40.0,20.0,25.485,5.0,0.0,754.0,0.0,1.0
75%,50.0,30.0,37.9825,7.0,1.0,1139.25,1.0,1.0
max,59.0,39.0,49.89,9.0,1.0,1492.0,1.0,1.0


In [10]:
# Print column names, non-null counts, dtypes and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   age                            1000 non-null   int64  
 1   gender                         1000 non-null   object 
 2   registration_time_days_before  1000 non-null   int64  
 3   location_distance_km           1000 non-null   float64
 4   event_type                     1000 non-null   object 
 5   past_attendance_count          1000 non-null   int64  
 6   reminder_sent                  1000 non-null   int64  
 7   ticket_price                   1000 non-null   int64  
 8   is_weekend_event               1000 non-null   int64  
 9   attended                       1000 non-null   int64  
dtypes: float64(1), int64(7), object(2)
memory usage: 78.3+ KB


In [12]:
# Dimensions of the frame as (number of rows, number of columns).
df.shape

(1000, 10)

In [13]:
# Column labels of the frame, including the `attended` target.
df.columns

Index(['age', 'gender', 'registration_time_days_before',
       'location_distance_km', 'event_type', 'past_attendance_count',
       'reminder_sent', 'ticket_price', 'is_weekend_event', 'attended'],
      dtype='object')

In [14]:
# dtype of every column; `object` columns are the ones encoded further down.
df.dtypes

Unnamed: 0,0
age,int64
gender,object
registration_time_days_before,int64
location_distance_km,float64
event_type,object
past_attendance_count,int64
reminder_sent,int64
ticket_price,int64
is_weekend_event,int64
attended,int64


In [16]:
# Number of missing values in each column (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

Unnamed: 0,0
age,0
gender,0
registration_time_days_before,0
location_distance_km,0
event_type,0
past_attendance_count,0
reminder_sent,0
ticket_price,0
is_weekend_event,0
attended,0


In [18]:
# Count rows that are exact repeats of an earlier row (the first occurrence is not counted).
df.duplicated(keep="first").sum()

np.int64(0)

In [20]:
# Pairwise correlation matrix of the numeric columns.
# The method has to be *called*: a bare `df.corr` only evaluates to the bound
# method object and computes nothing. `numeric_only=True` is required because
# the frame still contains string columns (`gender`, `event_type`), which
# cannot be correlated and make `corr()` raise on recent pandas versions.
df.corr(numeric_only=True)

In [22]:
# Distinct values of the target column, in order of first appearance.
df.loc[:, "attended"].unique()

array([1, 0])

In [24]:
# Sanity check: list every row whose age is below 0 or above 100.
df.loc[df["age"].lt(0) | df["age"].gt(100)]

Unnamed: 0,age,gender,registration_time_days_before,location_distance_km,event_type,past_attendance_count,reminder_sent,ticket_price,is_weekend_event,attended


In [25]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched by encoding.
df_encoded = df.copy(deep=True)

In [26]:
# Summary (count, unique, top, freq) restricted to the object-dtype columns.
df.describe(include=["object"])

Unnamed: 0,gender,event_type
count,1000,1000
unique,2,4
top,Male,Tech Talk
freq,517,258


In [27]:
# Object-dtype columns that are label-encoded before modelling.
label_cols = [
    "gender",
    "event_type",
]

In [28]:
# Encode each categorical column as integers with its OWN LabelEncoder and keep
# every fitted encoder in `label_encoders` (column name -> encoder).
# Re-fitting one shared encoder for every column would overwrite its classes
# each iteration, so only the last column's string <-> integer mapping would
# remain available for `inverse_transform` or for encoding new rows later.
label_encoders = {}

# `le` stays bound even when `label_cols` is empty; after the loop it refers to
# the encoder fitted on the last column, matching what this name held before.
le = LabelEncoder()
for col in label_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

In [30]:
# Inspect the `gender` column after label encoding (integer codes instead of strings).
df_encoded["gender"]

Unnamed: 0,gender
0,1
1,1
2,0
3,0
4,1
...,...
995,0
996,1
997,0
998,1


In [31]:
# Feature matrix: every encoded column except the `attended` target.
X = df_encoded.drop(columns=["attended"])

In [32]:
# Target vector: the `attended` column of the encoded frame.
y = df_encoded["attended"]

In [39]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [40]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test-set information leaks in.
# NOTE(review): the model cells below fit and predict on `X_train` / `X_test`,
# not on these scaled arrays -- confirm whether the scaled versions are meant
# to be used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [41]:
# Random forest of 200 trees; the fixed seed makes training repeatable.
rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
)

In [42]:
# Train the forest on the unscaled training features and their labels.
rf_model.fit(X=X_train, y=y_train)

In [43]:
# Predicted class label for every row of the held-out (unscaled) test features.
rf_pred = rf_model.predict(X=X_test)

In [44]:
# Display the raw predicted labels for the test split.
rf_pred

array([0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1])

In [45]:
# Evaluate the forest on the held-out split: overall accuracy, the confusion
# matrix, then per-class precision / recall / F1.
print("Random Forest Accuracy:", accuracy_score(y_true=y_test, y_pred=rf_pred))
print(confusion_matrix(y_true=y_test, y_pred=rf_pred))
print(classification_report(y_true=y_test, y_pred=rf_pred))

Random Forest Accuracy: 0.6666666666666666
[[ 53  62]
 [ 38 147]]
              precision    recall  f1-score   support

           0       0.58      0.46      0.51       115
           1       0.70      0.79      0.75       185

    accuracy                           0.67       300
   macro avg       0.64      0.63      0.63       300
weighted avg       0.66      0.67      0.66       300



In [46]:
import joblib

In [47]:
# Persist the trained forest and the fitted scaler to the working directory.
# NOTE(review): `rf_model` was fit on the unscaled `X_train`, so inputs must
# NOT be passed through `scaler.pkl` before calling this model -- confirm that
# whatever loads these files agrees. The label encoder(s) used for `gender` and
# `event_type` are not saved here.
joblib.dump(value=rf_model, filename="attendance_model.pkl")
joblib.dump(value=scaler, filename="scaler.pkl")

['scaler.pkl']