In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report , accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler , OrdinalEncoder
import numpy as np

In [None]:
# Colab-only step: prompt for a file upload and keep the result in `upload`.
# The following cell indexes `upload` by file name and wraps the value in
# io.BytesIO, so it is presumably a mapping of file name -> raw bytes
# (NOTE(review): confirm against the google.colab documentation).
from google.colab import files
upload = files.upload()

Saving AttendeeReport_Cleaned.csv to AttendeeReport_Cleaned.csv


In [None]:
import io

# Expose the uploaded bytes as an in-memory file so pandas can parse them as CSV.
csv_buffer = io.BytesIO(upload['AttendeeReport_Cleaned.csv'])
df = pd.read_csv(csv_buffer)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(3667, 23)

In [None]:
# Summary statistics of the numeric columns. `describe` has to be *called*:
# the bare attribute only displays the bound-method repr, not the statistics.
df.describe()

In [None]:
# Print the dtype pandas assigned to every column of the frame.
print(df.dtypes)


Attended                                                                                                                                                 object
User Name (Original Name)                                                                                                                                object
First Name                                                                                                                                               object
Last Name                                                                                                                                                object
City                                                                                                                                                     object
Country/Region                                                                                                                                           object
Zip/Postal Code                         

In [None]:
# Per-column count of null entries, printed as a Series (column -> count).
missing_values = df.isna().sum()
print(missing_values)

In [None]:
# Rename the target column so it has no space: 'Source Name' -> 'Source_name'.
target_rename = {'Source Name': 'Source_name'}
df = df.rename(columns=target_rename)

In [None]:
# Drop the rows whose target label ('Source_name') is missing.
# Reassigning the result instead of using inplace=True keeps the step explicit
# and avoids mutating a frame object that other names may still reference.
df = df.dropna(subset=['Source_name'])

In [None]:
## Lasso Regularization Model for FEATURE SELECTION#
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.linear_model import Lasso
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Everything except the target column is a candidate feature.
X = df.drop(columns='Source_name')
y = df['Source_name']

# Lasso is a regressor, so the class labels are mapped to integer codes first.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Column groups are chosen purely by dtype.
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Preprocessing followed by an L1-penalised linear model, fitted on all rows.
lasso = Lasso(alpha=0.1)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('lasso', lasso)])
pipeline.fit(X, y_encoded)

# Rank the encoded features by absolute coefficient, largest first.
importance_scores = np.abs(pipeline.named_steps['lasso'].coef_)
sorted_indices = np.argsort(importance_scores)[::-1]

# Keep the k highest-ranked encoded features.
k = 5  # Number of top features to select
selected_indices = sorted_indices[:k]

# Translate the selected positions back into encoded feature names and show them.
feature_names = np.array(pipeline.named_steps['preprocessor'].get_feature_names_out())
selected_features = feature_names[selected_indices]
print(selected_features)


['cat__I agree that I may be video recorded and acknowledge that I am aware that the recording will be available to the public._Yes'
 'num__Season' 'cat__Field_3D Printing in Hospitals' 'cat__Attended_No'
 'cat__Attended_Yes']


In [None]:
# Source columns kept for modelling. These are the original columns behind the
# encoded feature names printed by the Lasso ranking cell.
selected_features = [
    'I agree that I may be video recorded and acknowledge that I am aware that the recording will be available to the public.',
    'Season',
    'Field',
    'Attended',
]

# Independent copy of the frame restricted to those columns plus the target.
data_selected = df[[*selected_features, 'Source_name']].copy()


**Multinomial Logistic Regression Model**



In [None]:
# Multinomial Logistic Regression Model #
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

# Separate the features and target variable
X = data_selected[selected_features]
y = data_selected['Source_name']


# Separate the categorical features and numerical features
categorical_features = ['I agree that I may be video recorded and acknowledge that I am aware that the recording will be available to the public.', 'Attended' , 'Field']
numerical_features = ['Season']

# One-hot encode the categorical features.
# - The deprecated `sparse=False` keyword is dropped; the default sparse output
#   is accepted by LogisticRegression.
# - handle_unknown='ignore' keeps predict() from raising when the test split
#   contains a category that was absent from the training split.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Median-impute and standardise the numerical features so they can be fed to
# the linear model alongside the one-hot columns.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Combine the transformers using ColumnTransformer.
# Both column groups must be listed explicitly: ColumnTransformer drops every
# column it is not told about, which previously discarded 'Season'.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ])

# Create the pipeline with the preprocessor and logistic regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', LogisticRegression(multi_class='multinomial', max_iter=1000))])




In [None]:
# Hold out 20% of the rows for testing; stratify=y is passed so the split is
# stratified on the target labels, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Train on the training split, then predict labels for the held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)





In [None]:
# Accuracy on the held-out rows, computed from the predictions made above
# (same value model.score(X_test, y_test) reports, without predicting twice).
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Support-weighted average of the per-class F1 scores.
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score:", f1)

Accuracy: 0.5561797752808989
F1 score: 0.48336920032882785


**Random Forest - BK - Selected Features**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Imported here as well so the cell does not depend on names leaked from
# earlier cells.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Separate the features and target variable
X = data_selected[selected_features]
y = data_selected['Source_name']

# Define the categorical and numerical features
categorical_features = ['I agree that I may be video recorded and acknowledge that I am aware that the recording will be available to the public.', 'Attended', 'Field']
numerical_features = ['Season']

# One-hot encode the categorical features and median-impute the numerical one.
# - Both groups are listed explicitly because ColumnTransformer drops any
#   column it is not told about, which previously discarded 'Season'.
# - handle_unknown='ignore' keeps predict() from raising on a category that
#   appears only in the test split.
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', SimpleImputer(strategy='median'), numerical_features),
])

# Create the Random Forest model
model = RandomForestClassifier(n_estimators=300, random_state=42)

# Create the pipeline with the preprocessor and Random Forest model
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model on the training data
pipeline.fit(X_train, y_train)

# Predict the target variable for the testing data
y_pred = pipeline.predict(X_test)

# Evaluate the model's performance
accuracy = pipeline.score(X_test, y_test)
print("Accuracy:", accuracy)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score:", f1)


Accuracy: 0.5561797752808989
F1 score: 0.49699824579747


**Random Forest BK - All features**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score

# Use every column except the target as a feature.
X = df.drop(columns='Source_name')
y = df['Source_name']

# Object-dtype columns are the ones that get one-hot encoded.
categorical_features = list(X.select_dtypes(include='object').columns)

# Encode the categorical columns; all remaining columns pass through untouched.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)

# 80/20 train/test split with a fixed shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Fit the encoder on the training rows only and reuse it for the test rows.
# Note: X_train / X_test now hold the *encoded* matrices, not DataFrames.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Random forest with class weights balanced against class frequency.
model = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

# Cross-validation was drafted here but left disabled:
#cv_scores = cross_val_score(model, preprocessor.fit_transform(X), y, cv=5)

# Train, then predict labels for the held-out rows.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Share of held-out rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Support-weighted average of the per-class F1 scores.
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score:", f1)



Accuracy: 0.6067415730337079
F1 score: 0.5271432482576971


**DECISION TREE - NEW BK**

In [None]:
# Decision Tree model #
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, train_test_split

# X_train / X_test were already one-hot encoded by the ColumnTransformer in the
# "Random Forest BK - All features" cell. Encoding them a second time would
# treat every encoded column (and every passthrough numeric column) as a new
# categorical variable, so the matrices are used as they are. The names
# X_train_encoded / X_test_encoded are kept for any code that refers to them.
# The tree accepts sparse input, so no dense conversion is needed either.
X_train_encoded = X_train
X_test_encoded = X_test

# Create the Decision Tree model; random_state is fixed so that repeated runs
# give the same tree and the same scores, as for the other models in this notebook.
model = DecisionTreeClassifier(random_state=42)

# Perform cross-validation and calculate accuracy scores
accuracy_scores = cross_val_score(model, X_train_encoded, y_train, cv=5, scoring='accuracy')

# Fit the model on the training data
model.fit(X_train_encoded, y_train)

# Predict the target variable for the testing data
y_pred = model.predict(X_test_encoded)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

# Print the accuracy scores from cross-validation
print("Cross-Validation Accuracy Scores:", accuracy_scores)




Accuracy: 0.547752808988764
F1 Score: 0.5216626424861378
Cross-Validation Accuracy Scores: [0.56140351 0.56315789 0.59578207 0.56942004 0.58347979]
