In [1]:
# Import all the important liberaries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', None)
import io
from google.colab import files

In [2]:
# upload the data
uploaded = files.upload()

Saving JEOPARDY_CSV.csv to JEOPARDY_CSV.csv


In [3]:
df = pd.read_csv(io.BytesIO(uploaded.get('JEOPARDY_CSV.csv')))
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
#Checking for null values
df.isnull().sum()

Show Number    0
 Air Date      0
 Round         0
 Category      0
 Value         0
 Question      0
 Answer        2
dtype: int64

In [5]:
# Checking for data type of each columns
df.dtypes

Show Number     int64
 Air Date      object
 Round         object
 Category      object
 Value         object
 Question      object
 Answer        object
dtype: object

In [6]:
#There are no null values but some values in the 'Value' column are filled with the string 'None'.
df[df[' Value'] == 'None'].count()

Show Number    3634
 Air Date      3634
 Round         3634
 Category      3634
 Value         3634
 Question      3634
 Answer        3634
dtype: int64

In [7]:
#Dropping rows containing 'None' Values.
df.drop(df[df[' Value'] == 'None'].index,inplace=True)

In [8]:
#The 'Value' column has the string of the value which also contains $ sign and columns, 
#so removing the signs and converting the string to interger value in a new column 'ValueNum'
df['ValueNum'] = df[' Value'].apply(
    lambda value: int(value.replace('$', '').replace(',','')))

In [9]:
#checking the number of unique values populated in ValueNum columns
df['ValueNum'].nunique()

145

In [10]:
# There are 145 unique values in the ValueNum column so it makes a lot of different categories to classify.
#Binning the values if the value is smaller than 1000, then we round to the nearest hundred. Otherwise, if it's between 1000 and 10k, 
#we round it to nearest thousand. If it's greater than 10k, then we round it to the nearest 10-thousand.
def binning(value):
    if value < 1000:
        return np.round(value, -2)
    elif value < 10000:
        return np.round(value, -3)
    else:
        return np.round(value, -4)

df['ValueBins'] = df['ValueNum'].apply(binning)

In [11]:
#So, Now we have 21 different values to classify instead of 145
df['ValueBins'].nunique()

21

## Building a Random Forest Model
Since the data is huge, but for our convenience lets take 10,000 random samples from the dataframe

In [12]:
df_sample = df.sample(n=10000)

In [13]:
# We will use a Random Forest Classifier model with Grid Searching for finding the best hyperparameters from our dictionary of parameters


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

RFC=RandomForestClassifier(max_features="sqrt")
parameters={ "max_depth":[5,8,25], 
             "min_samples_split":[1,2,5], "n_estimators":[800,1200]}
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(RFC, parameters)


In [14]:
# Splitting the into dependent and independent features

X = df_sample[' Question']
y = df_sample['ValueBins']

In [15]:
tfidf = TfidfVectorizer(stop_words='english')

X = tfidf.fit_transform(X)

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)

In [17]:
clf.fit(X_train,y_train)
print(clf.cv_results_['params'])

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

ValueError: min_sam

[{'max_depth': 5, 'min_samples_split': 1, 'n_estimators': 800}, {'max_depth': 5, 'min_samples_split': 1, 'n_estimators': 1200}, {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 800}, {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 1200}, {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 800}, {'max_depth': 5, 'min_samples_split': 5, 'n_estimators': 1200}, {'max_depth': 8, 'min_samples_split': 1, 'n_estimators': 800}, {'max_depth': 8, 'min_samples_split': 1, 'n_estimators': 1200}, {'max_depth': 8, 'min_samples_split': 2, 'n_estimators': 800}, {'max_depth': 8, 'min_samples_split': 2, 'n_estimators': 1200}, {'max_depth': 8, 'min_samples_split': 5, 'n_estimators': 800}, {'max_depth': 8, 'min_samples_split': 5, 'n_estimators': 1200}, {'max_depth': 25, 'min_samples_split': 1, 'n_estimators': 800}, {'max_depth': 25, 'min_samples_split': 1, 'n_estimators': 1200}, {'max_depth': 25, 'min_samples_split': 2, 'n_estimators': 800}, {'max_depth': 25, 'min_samples_split': 2, 'n

In [18]:
print(clf.cv_results_['rank_test_score'])

[18 16  7 12 10  6 17 13  7  9 10  5 14 15  2  1  2  2]


In [19]:
print(clf.cv_results_['params'][-2])

{'max_depth': 25, 'min_samples_split': 5, 'n_estimators': 800}


In [22]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         100       0.00      0.00      0.00       120
         200       0.50      0.00      0.01       459
         300       0.00      0.00      0.00       111
         400       0.19      0.97      0.32       559
         500       0.00      0.00      0.00       138
         600       0.00      0.00      0.00       283
         700       0.00      0.00      0.00         3
         800       0.00      0.00      0.00       464
        1000       0.15      0.03      0.04       460
        2000       0.16      0.02      0.03       381
        3000       0.00      0.00      0.00        11
        4000       0.00      0.00      0.00         4
        5000       0.00      0.00      0.00         4
        6000       0.00      0.00      0.00         2
        8000       0.00      0.00      0.00         1

    accuracy                           0.19      3000
   macro avg       0.07      0.07      0.03      3000
weighted avg       0.16   

  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=200)
lr.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         100       0.00      0.00      0.00       120
         200       0.50      0.00      0.01       459
         300       0.00      0.00      0.00       111
         400       0.19      0.97      0.32       559
         500       0.00      0.00      0.00       138
         600       0.00      0.00      0.00       283
         700       0.00      0.00      0.00         3
         800       0.00      0.00      0.00       464
        1000       0.15      0.03      0.04       460
        2000       0.16      0.02      0.03       381
        3000       0.00      0.00      0.00        11
        4000       0.00      0.00      0.00         4
        5000       0.00      0.00      0.00         4
        6000       0.00      0.00      0.00         2
        8000       0.00      0.00      0.00         1

    accuracy                           0.19      3000
   macro avg       0.07      0.07      0.03      3000
weighted avg       0.16   

  _warn_prf(average, modifier, msg_start, len(result))
