# **Machine Learning - BCSE209L**

### Shravan Venkatraman - 21BCE1200

### Anirudh Vinodh - 21BCE1194

### Shriyans A - 21BCE1121

## **Import Dependencies**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier



## **Data Exploration**

In [None]:
# Read the data once. df is the working frame that the preprocessing cells
# transform; df2 is a separate copy used by the exploratory plots.
DATA_PATH = "/kaggle/input/jeopardata/jeopardata.csv"
df = pd.read_csv(DATA_PATH)
df2 = df.copy()  # deep copy, so changes to df never show up in df2 (and vice versa)
df.head()

In [None]:
# Get the size of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Get descriptive statistics for the dataset (pandas summarises the numeric
# columns by default when the frame has mixed dtypes)
df.describe()

In [None]:
# Report how many distinct values each object-typed (categorical) column holds
object_columns = [name for name in df.columns if df[name].dtype == "object"]
for name in object_columns:
    distinct_values = df[name].unique()
    print(f"{name}: {len(distinct_values)}")

In [None]:
# List every column next to its pandas dtype
for column_name, column_dtype in df.dtypes.items():
    print(f"{column_name}: {column_dtype}")

In [None]:
# Check for NULL values: count the missing entries in each column
df.isnull().sum()

## **Exploratory Data Analysis**

In [None]:
# Build a ydata-profiling report from df2 and embed it in the notebook output
report = ProfileReport(df2)
report.to_notebook_iframe()

In [None]:
# Group by state and calculate total correct and incorrect answers in Round One
df2_state_grouped = df2.groupby('Home State')[['Round One Correct Answers', 'Round One Incorrect Answers']].sum()

# Grid geometry: a fixed number of columns and as many rows as needed
# (ceiling division). At least one row is kept so plt.subplots never
# receives a zero-sized grid when there are no states.
num_states = len(df2_state_grouped)
num_cols = 3
num_rows = max(1, (num_states + num_cols - 1) // num_cols)

# squeeze=False always returns a 2-D array of Axes, so no reshape
# special-casing is needed for single-row or single-column grids.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows), squeeze=False)
flat_axes = axes.ravel()

# One colour per bar: correct answers first, incorrect answers second
colors = ['#1f77b4', '#ff7f0e']

# One subplot per state; each row of the grouped frame is a two-value Series
for ax, (state, data) in zip(flat_axes, df2_state_grouped.iterrows()):
    data.plot(kind='bar', stacked=True, ax=ax, color=colors)
    ax.set_title(f'Distribution of Correct and Incorrect Answers in Round One for {state}')
    # The bars are the two answer categories, not states
    ax.set_xlabel('Answer Type')
    ax.set_ylabel('Number of Answers')
    ax.tick_params(axis='x', rotation=45)  # Rotate x-labels

# Remove the unused trailing cells of the grid
for ax in flat_axes[num_states:]:
    fig.delaxes(ax)

plt.tight_layout()  # Adjust layout of the entire figure
plt.show()  # Show all subplots


In [None]:
import plotly.express as px
# Mean score per round for every home state, reshaped to long form for plotly
round_score_columns = ['Round One Score', 'Round Two Score', 'Final Jeopardy Score']
df2_state_grouped = (
    df2.groupby('Home State')[round_score_columns]
    .mean()
    .reset_index()
)
df2_state_melted = df2_state_grouped.melt(
    id_vars='Home State',
    var_name='Round',
    value_name='Total Game Score',
)

# Area chart with one coloured band per round
fig = px.area(
    df2_state_melted,
    x='Home State',
    y='Total Game Score',
    color='Round',
    title='Evolution of Total Game Score for Each State',
    labels={'Home State': 'State', 'Total Game Score': 'Total Game Score'},
)
fig.update_layout(
    xaxis={'tickangle': 45},
    yaxis={'title': 'Total Game Score'},
    legend_title='Round',
)
fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# Average Round One correct-answer percentage per home state (df2 is the
# exploratory copy of the dataset)
percentage_column = 'Round One Correct Answer Percentage'
df2_state_mean = (
    df2.groupby('Home State')[percentage_column]
    .mean()
    .reset_index()
)

# Interactive bar chart of the per-state means
fig = px.bar(
    df2_state_mean,
    x='Home State',
    y='Round One Correct Answer Percentage',
    title='Average Round One Correct Answer Percentage by Home State',
    labels={'Round One Correct Answer Percentage': 'Mean Correct Answer Percentage'},
)
fig.show()


In [None]:
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
def show_city_stats(home_state):
    """Show a city selector for ``home_state`` and chart the chosen city's stats.

    Builds a dropdown of the cities recorded for the state in ``df2`` and wires
    it to a nested callback that bar-charts the per-round and total-game
    statistics of the first row matching the selected city.

    Args:
        home_state: Value of the 'Home State' column to filter on.
    """
    stat_columns = ['Round One Attempts', 'Round One Buzzes', 'Round One Buzz Percentage',
                    'Round One Correct Answers', 'Round One Incorrect Answers',
                    'Round One Correct Answer Percentage', 'Round One Daily Doubles',
                    'Round One Score', 'Round Two Attempts', 'Round Two Buzzes',
                    'Round Two Buzz Percentage', 'Round Two Correct Answers',
                    'Round Two Incorrect Answers', 'Round Two Correct Answer Percentage',
                    'Round Two Daily Double 1', 'Round Two Daily Double 2', 'Round Two Score',
                    'Final Jeopardy Starting Score', 'Final Jeopardy Wager',
                    'Final Jeopardy Score', 'Total Game Attempts', 'Total Game Buzzes',
                    'Total Game Buzz Percentage', 'Total Game Correct Answers',
                    'Total Game Incorrect Answers', 'Total Game Correct Answer Percentage',
                    'Total Game Daily Doubles Correct', 'Total Game Daily Doubles Incorrect',
                    'Total Game Daily Double Winnings', 'Total Game Score']

    # Missing cities are dropped: NaN never compares equal to itself, so a NaN
    # option would select no rows and make the .iloc[0] lookup below raise.
    cities = df2[df2['Home State'] == home_state]['Home City'].dropna().unique()
    city_dropdown = widgets.Dropdown(options=cities, description='Select City:')

    def show_stats(home_city):
        """Bar-chart the stat columns of the first row for this city/state pair."""
        matches = df2[(df2['Home State'] == home_state) & (df2['Home City'] == home_city)]
        if matches.empty:
            # Nothing to plot (e.g. the state has no recorded city)
            print(f'No contestants found for {home_city}, {home_state}')
            return
        # Only the first matching row is charted
        stats = matches.iloc[0][stat_columns]

        plt.figure(figsize=(20, 18))
        stats.plot(kind='bar')
        plt.title(f'Statistics for {home_city}, {home_state}')
        plt.xlabel('Categories')
        plt.ylabel('Values')
        plt.xticks(rotation=45)
        plt.show()

    widgets.interact(show_stats, home_city=city_dropdown)


# Top-level selector: choosing a state rebuilds the city dropdown and its chart
state_dropdown = widgets.Dropdown(options=df2['Home State'].unique(), description='Select State:')
widgets.interact(show_city_stats, home_state=state_dropdown)


In [None]:
# Histogram of every numeric (and boolean) column on one figure.
# The grid is sized from the number of columns, so nothing is dropped when
# there are more columns than a fixed grid could hold, and no exception has
# to be swallowed to keep the cell running.
numeric_columns = df.select_dtypes(include=['number', 'bool']).columns
hist_cols = 8
hist_rows = max(1, -(-len(numeric_columns) // hist_cols))  # ceiling division

# 3 inches of height per row (a 4-row grid gives a 20x12 figure)
fig, axes = plt.subplots(hist_rows, hist_cols, figsize=(20, 3 * hist_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, COLUMN in zip(flat_axes, numeric_columns):
    # Missing values are dropped and booleans are cast to 0/1 so hist() always gets floats
    ax.hist(df[COLUMN].dropna().astype(float), bins=20, color='#004C99')
    ax.set_title(COLUMN, fontsize=8)  # identify each panel

# Remove the unused trailing cells of the grid
for ax in flat_axes[len(numeric_columns):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# Histogram (with KDE overlay) of the Round One scores
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Round One Score'], bins=20, kde=True, color='skyblue', alpha=0.7, ax=ax)
ax.set_title('Distribution of Round One Score')
ax.set_xlabel('Round One Score')
ax.set_ylabel('Frequency')
ax.grid(True)
ax.legend(['Round One Score'])
plt.show()


In [None]:
# Lift pandas' column display limit so every column is shown, then preview the first rows
pd.set_option('display.max_columns', None)
df.head()

In [None]:
# Count contestants per outcome. sort_index() pins the bar order so the colours
# and tick labels do not depend on which class happens to be more frequent.
# NOTE(review): assumes the non-winner value sorts first (False/0 before
# True/1), as the later `== 1` / `== True` comparisons suggest — confirm.
winner_counts = df['Is Winner'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
winner_counts.plot(kind='bar', color=['lightcoral', 'lightgreen'], alpha=0.7, ax=ax)
ax.set_title('Contestant Winner Status')
ax.set_xlabel('Winner Status')
ax.set_ylabel('Count')

# Name the bars on the axis itself. A legend cannot do this here: the chart is
# a single bar series, so a legend would only ever show one entry.
if len(winner_counts) == 2:
    ax.set_xticklabels(['Not Winner', 'Winner'])
plt.setp(ax.get_xticklabels(), rotation=0)

ax.grid(axis='y')
plt.show()


In [None]:
# Scatter of Round One buzz percentage against Round One score
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Round One Buzz Percentage',
    y='Round One Score',
    color='orange',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Round One Buzz Percentage vs. Round One Score')
ax.set_xlabel('Round One Buzz Percentage')
ax.set_ylabel('Round One Score')
ax.grid(True)
ax.legend(['Scores'])
plt.show()


In [None]:
# Line plot of Total Game Score in row order.
# NOTE(review): the x values are the DataFrame's row index, while the axis is
# labelled 'Episode Number' — confirm the index really tracks episodes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    df.index,
    df['Total Game Score'],
    color='purple',
    linestyle='-',
    linewidth=2,
    marker='o',
    markersize=5,
)
ax.set_title('Total Game Score Over Episodes')
ax.set_xlabel('Episode Number')
ax.set_ylabel('Total Game Score')
ax.grid(True)
ax.legend(['Total Game Score'])
plt.show()


In [None]:
# Violin plot showing the spread of the Round One buzz percentage
fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=df, y='Round One Buzz Percentage', color='lightpink', linewidth=2, ax=ax)
ax.set_title('Violin plot of Round One Buzz Percentage')
ax.set_ylabel('Round One Buzz Percentage')
ax.grid(axis='y')
ax.legend(['Buzz Percentage'])
plt.show()


In [None]:
# Total Game Score per episode (rows), split by winner status (columns);
# pivot_table aggregates repeated cells with its default function
pivot_df = df.pivot_table(values='Total Game Score', index='Episode Number', columns='Is Winner')

# Annotated heatmap of the pivoted scores
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(pivot_df, cmap='viridis', annot=True, fmt=".0f", linewidths=.5, ax=ax)

ax.set_xlabel("Winner Status", fontsize=12)
ax.set_ylabel("Episode Number", fontsize=12)
plt.setp(ax.get_yticklabels(), rotation=0)  # keep the episode labels horizontal
fig.tight_layout()

plt.show()


In [None]:
# Round One buzz percentage against the Final Jeopardy wager, coloured by winner status
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='Round One Buzz Percentage',
    y='Final Jeopardy Wager',
    hue='Is Winner',
    data=df,
    ax=ax,
)
ax.set_title('Correlation between Round One Buzz Percentage and Final Jeopardy Wager for Winners and Non-winners')
ax.set_xlabel('Round One Buzz Percentage')
ax.set_ylabel('Final Jeopardy Wager')
ax.legend(title='Is Winner', loc='upper right')
fig.tight_layout()
plt.show()


In [None]:
# Sum of Total Game Score over all contestants of each home state
df2_grouped = df2.groupby('Home State')['Total Game Score'].sum().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(df2_grouped['Home State'], df2_grouped['Total Game Score'])
ax.set_xlabel('Home State')
ax.set_ylabel('Total Game Score')
ax.set_title('Total Game Score for Each Home State')
# Right-aligned, rotated labels keep neighbouring state names from overlapping
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Palette for the pie wedges
colors = plt.cm.tab20c.colors

# Mean Round Two correct-answer percentage per home state
df2_grouped = df2.groupby('Home State')['Round Two Correct Answer Percentage'].mean().reset_index()

# Large canvas so the many state labels stay legible
fig, ax = plt.subplots(figsize=(20, 20))
ax.pie(
    df2_grouped['Round Two Correct Answer Percentage'],
    labels=df2_grouped['Home State'],
    autopct='%1.1f%%',
    colors=colors,
)
ax.set_title('Distribution of Correct Answer Percentages in Round Two by Home State')
plt.show()

In [None]:
# Number of rows (contestant appearances) per home state, drawn smallest first
bar_color = 'green'
contestant_counts = df2['Home State'].value_counts()

fig, ax = plt.subplots(figsize=(16, 10))
contestant_counts.sort_values().plot(kind='barh', color=bar_color, ax=ax)
ax.set_title('Number of Contestants from Each Home State')
ax.set_xlabel('Number of Contestants')
ax.set_ylabel('Home State')
ax.grid(axis='x')
plt.show()

In [None]:
# Share of contestants per home state, as a percentage of all rows
state_counts = 100 * df2['Home State'].value_counts(normalize=True)

fig, ax = plt.subplots(figsize=(12, 6))
state_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Percentage of Contestants from Each Home State')
ax.set_xlabel('Home State')
ax.set_ylabel('Percentage of Contestants')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Keep only the first whitespace-separated token of each first-name entry.
# The vectorised .str accessor leaves missing or blank names as NaN instead of
# raising, which a per-row x.split()[0] would do.
df2['Contestant First Name'] = df2['Contestant First Name'].str.split().str[0]

# Find the top 10 highest scorers for each round
top_10_round_one = df2.nlargest(10, 'Round One Score')
top_10_round_two = df2.nlargest(10, 'Round Two Score')
final_10_scores = df2.nlargest(10, 'Total Game Score')

# One panel per ranking: (rows, score column, title, bar colour)
panels = [
    (top_10_round_one, 'Round One Score', 'Top 10 highest scorers for Round One', 'skyblue'),
    (top_10_round_two, 'Round Two Score', 'Top 10 highest scorers for Round Two', 'lightgreen'),
    (final_10_scores, 'Total Game Score', 'Final 10 highest scores', 'lightcoral'),
]

fig, axs = plt.subplots(3, 1, figsize=(10, 15))

for ax, (top_rows, score_column, title, color) in zip(axs, panels):
    # Bars are placed by position, not by name: with names as categorical x
    # values, contestants who share a first name would be drawn on top of each
    # other and the chart would show fewer than ten bars.
    positions = list(range(len(top_rows)))
    ax.bar(positions, top_rows[score_column], color=color)
    ax.set_xticks(positions)
    ax.set_xticklabels(top_rows['Contestant First Name'].fillna('Unknown'), rotation=45)
    ax.set_title(title)
    ax.set_ylabel(score_column)
    ax.set_xlabel('Contestant First Name')

plt.tight_layout()
plt.show()


In [None]:
import plotly.graph_objects as go
import plotly.express as px
from ipywidgets import interact

# Callback for the interactive state selector
def create_pie_chart(home_state):
    """Display a pie chart of the 'Is Winner' value counts for one home state.

    Args:
        home_state: Value of the 'Home State' column in df2 to filter on.
    """
    # Rows of the chosen state, tallied by winner status
    state_rows = df2.loc[df2['Home State'] == home_state]
    outcome_tally = state_rows['Is Winner'].value_counts()

    chart = px.pie(
        names=outcome_tally.index,
        values=outcome_tally.values,
        title=f'Distribution of Winners in {home_state}',
        labels={'index': 'Is Winner'},
    )
    chart.show()

# Collect the distinct home states present in df2
home_states = df2['Home State'].unique()

# Let ipywidgets build a selector from those states; each selection calls create_pie_chart
interact(create_pie_chart, home_state=home_states)


In [None]:
# Number of rows per distinct 'Episode Title' value
episode_counts = df2['Episode Title'].value_counts()

# Pie chart of the episode-type split
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    episode_counts,
    labels=episode_counts.index,
    autopct='%1.1f%%',
    colors=plt.cm.tab10.colors,
)
ax.set_title('Split of Different Episode Types')
ax.axis('equal')  # equal aspect ratio, so the pie is drawn as a circle
fig.tight_layout()
plt.show()

In [None]:
# Split contestants by outcome so each group gets its own colour
winners = df[df['Is Winner'] == 1]
non_winners = df[df['Is Winner'] == 0]

# Scatter of Round One attempts (x) against Round One buzzes (y)
plt.figure(figsize=(10, 6))
plt.scatter(non_winners['Round One Attempts'], non_winners['Round One Buzzes'], color='blue', label='Non-Winner')
plt.scatter(winners['Round One Attempts'], winners['Round One Buzzes'], color='red', label='Winner')
# Axis labels follow the plotted data: attempts on x, buzzes on y
plt.xlabel('Round One Attempts')
plt.ylabel('Round One Buzzes')
plt.title('Relationship between Round One Attempts, Round One Buzzes, and Winner Status')
plt.legend()
plt.grid(True)
plt.show()

In [None]:


# Count the winning rows per home state, most winners first
is_winner_mask = df2['Is Winner'] == True
winners_by_state = (
    df2[is_winner_mask]
    .groupby('Home State')['Is Winner']
    .count()
    .sort_values(ascending=False)
)

# Keep the ten states with the most winners
top_states = winners_by_state.head(10)

# Bar chart of those states
fig, ax = plt.subplots(figsize=(10, 6))
top_states.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Top Performing Home States by Number of Winners')
ax.set_xlabel('Home State')
ax.set_ylabel('Number of Winners')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


## **Data Preprocessing**

In [None]:
# Drop duplicate rows (drop_duplicates removes repeated rows, not columns)
df = df.drop_duplicates()

In [None]:
# Remove date, name and location columns that are not used as model features
non_feature_columns = [
    'Episode Date',
    'Home State',
    'Contestant Last Name',
    'Contestant First Name',
    'Home City',
]
df = df.drop(columns=non_feature_columns)

In [None]:
# Absolute pairwise correlations between the feature columns (the episode
# title and the target are left out) to spot highly correlated pairs
feature_frame = df.drop(columns=['Episode Title', 'Is Winner'])
corr_matrix = feature_frame.corr().abs()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, cmap="YlGnBu", ax=ax)
plt.show()

In [None]:
# One-hot encode the episode title into integer indicator columns named 'Title_<value>'
title_dummies = pd.get_dummies(df['Episode Title'], prefix='Title')
one_hot_encoded = title_dummies.astype(int)

# Replace the original text column with the indicators, which go at the end of the frame
df = pd.concat([df.drop(columns='Episode Title'), one_hot_encoded], axis=1)

In [None]:
# Encode the target column as integer class labels
le = preprocessing.LabelEncoder().fit(df['Is Winner'])
encoded_values = le.transform(df['Is Winner'])
df['Is Winner'] = encoded_values

In [None]:
# Check the frame's (rows, columns) after the column drops and one-hot encoding
df.shape

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


In [None]:
# Target vector first, then the feature matrix (every column except the target)
y = df['Is Winner']
X = df.drop(columns='Is Winner')

In [None]:
# PCA centres the data but does not rescale it, so features measured on large
# scales would dominate the components. Standardise every feature to zero
# mean and unit variance first.
X_scaled = StandardScaler().fit_transform(X)

# Keep up to 25 components, never more than the data can provide
n_components = min(25, X.shape[0], X.shape[1])
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(X_scaled)
principalDf = pd.DataFrame(data = principalComponents )

# NOTE(review): the scaler and PCA are fitted on all rows before the train/test
# split, so test rows influence the projection. For a leakage-free estimate,
# fit both on the training split only.

In [None]:
# Feature/target names for the PCA experiments (the target itself is unchanged)
X_pca, y_pca = principalDf, y

## **Train Vanilla Models**

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate models keyed by display name.
# Every estimator that uses randomness gets a fixed seed, so the reported
# accuracies are reproducible on re-run.
RANDOM_STATE = 42
models = {
    'Adaboost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(max_depth=4, random_state=RANDOM_STATE),
    'Extra Decision Tree': ExtraTreeClassifier(max_depth=4, random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(),
    'Extra Trees': ExtraTreesClassifier(max_depth=4, random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(),
    'MLP': MLPClassifier(
        hidden_layer_sizes=(200,),
        max_iter=300,
        early_stopping=True,
        random_state=RANDOM_STATE,
    )
}

In [None]:
# Train each candidate on the training split and report its test-set accuracy
for name, model in models.items():
    print(f"Training model: {name}...")
    # fit() returns the estimator itself, so scoring can be chained
    score = model.fit(X_train, y_train).score(X_test, y_test)
    print(f'{name}: Accuracy - {score:.4f}\n')

## **Train Ensemble Models**

In [None]:
# Create the individual classifiers
xgbc = XGBClassifier(n_estimators=100)
rf = RandomForestClassifier(random_state=42)
svc = SVC()

# Create a voting classifier with 'hard' voting (majority vote of predicted labels)
voting_clf = VotingClassifier(estimators=[('xgbc', xgbc), ('rf', rf), ('svc', svc)], voting='hard')

# Fit the voting classifier on the training data
voting_clf.fit(X_train, y_train)

# Score the ensemble itself, not `model`: that name is only the last
# estimator left over from the training loop.
score = voting_clf.score(X_test, y_test)
print(f'Ensembling using Voting Classifier (XGBoost, Random Forest and Support Vector Machines): Accuracy - {score:.4f}\n')

## **Results**

In [None]:
# Predict the test-set labels with the 'Random Forest' entry of the models dictionary
y_pred=models['Random Forest'].predict(X_test)

In [None]:
def plot_confusion_matrix(y_test, y_pred):
    """Draw an annotated heatmap of the confusion matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.
    """
    class_names = ["Not Winner", "Winner"]
    cell_centres = [0.5, 1.5]  # centre of each of the two heatmap cells per axis

    matrix = confusion_matrix(y_test, y_pred)
    ax = sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")
    ax.set_xticks(cell_centres)
    ax.set_xticklabels(class_names)
    ax.set_yticks(cell_centres)
    ax.set_yticklabels(class_names)
    plt.show()

In [None]:
# Confusion matrix of the predictions stored in y_pred against the test labels
plot_confusion_matrix(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 for the predictions stored in y_pred
print(classification_report(y_test, y_pred, target_names=["Not Winner", "Winner"]))

## **PCA Training**

In [None]:
# Same 80/20 split and seed as before, this time on the PCA components.
# This overwrites the X_train / X_test / y_train / y_test of the earlier section.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fresh, unfitted candidates for the PCA features, keyed by display name.
# Every estimator that uses randomness gets a fixed seed, so the reported
# accuracies are reproducible on re-run.
RANDOM_STATE = 42
models = {
    'Adaboost': AdaBoostClassifier(random_state=RANDOM_STATE),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(max_depth=4, random_state=RANDOM_STATE),
    'Extra Decision Tree': ExtraTreeClassifier(max_depth=4, random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(),
    'Extra Trees': ExtraTreesClassifier(max_depth=4, random_state=RANDOM_STATE),
    'KNN': KNeighborsClassifier(),
    'MLP': MLPClassifier(
        hidden_layer_sizes=(200,),
        max_iter=300,
        early_stopping=True,
        random_state=RANDOM_STATE,
    )
}

In [None]:

# Fit every candidate on the current training split and print its test-set accuracy
for name in models:
    model = models[name]
    print(f"Training model: {name}...")
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(f'{name}: Accuracy - {score:.4f}\n')

In [None]:
# Create the individual classifiers
xgbc = XGBClassifier(n_estimators=100)
rf = RandomForestClassifier(random_state=42)
svc = SVC()

# Create a voting classifier with 'hard' voting (majority vote of predicted labels)
voting_clf = VotingClassifier(estimators=[('xgbc', xgbc), ('rf', rf), ('svc', svc)], voting='hard')

# Fit the voting classifier on the training data
voting_clf.fit(X_train, y_train)

# Score the ensemble itself, not `model`: that name is only the last
# estimator left over from the training loop.
score = voting_clf.score(X_test, y_test)
print(f'Ensembling using Voting Classifier (XGBoost, Random Forest and Support Vector Machines): Accuracy - {score:.4f}\n')

In [None]:
# Predict the test-set labels with the 'Random Forest' entry of the current models dictionary
y_pred=models['Random Forest'].predict(X_test)

In [None]:
def plot_confusion_matrix(y_test, y_pred):
    """Plot the confusion matrix of ``y_pred`` against ``y_test`` as a heatmap.

    Cell counts are annotated as integers and both axes are labelled
    "Not Winner" / "Winner".
    """
    tick_spec = ([0.5, 1.5], ["Not Winner", "Winner"])  # (tick positions, tick labels)

    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt="d", cmap="Blues")
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.xticks(*tick_spec)
    plt.yticks(*tick_spec)
    plt.show()

In [None]:
# Confusion matrix of the predictions stored in y_pred against the current test labels
plot_confusion_matrix(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 for the predictions stored in y_pred
print(classification_report(y_test, y_pred, target_names=["Not Winner", "Winner"]))