In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
%matplotlib inline

### Load data

In [2]:
show_data = pd.read_csv('../data/JEOPARDY_CSV.csv')
show_data.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [3]:
show_data.nunique()

Show Number      3640
 Air Date        3640
 Round              4
 Category       27995
 Value            149
 Question      216124
 Answer         88267
dtype: int64

In [4]:
show_data.drop('Show Number', axis=1, inplace=True)
show_data.rename(columns=lambda x: x.strip(), inplace=True)
show_data = show_data[~(show_data['Round'].isin(["Final Jeopardy!", "Tiebreaker"]))]

In [5]:
show_data['Value'] = show_data['Value'].str.replace("$", '')
show_data['Value'] = show_data['Value'].str.replace(",", '')
show_data['Value'] = show_data['Value'].astype(int)
show_data.head()

Unnamed: 0,Air Date,Round,Category,Value,Question,Answer
0,2004-12-31,Jeopardy!,HISTORY,200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,200,The city of Yuma in this state has a record av...,Arizona
3,2004-12-31,Jeopardy!,THE COMPANY LINE,200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


### EDA

In [6]:
show_data['Air Date'] = pd.to_datetime(show_data['Air Date'])
str(show_data['Air Date'].dt.date.min()) + ' to ' +str(show_data['Air Date'].dt.date.max())

'1984-09-10 to 2012-01-27'

In [7]:
plt_df = show_data.groupby(show_data['Air Date'].dt.year).mean('Value')
plt_df.plot(xticks=plt_df.index, rot=90);

In [8]:
show_data['air_date_group'] = show_data['Air Date'].apply(lambda x: 'pre-2002' if x.year < 2002 else 'post-2002')
show_data.drop('Air Date', axis=1, inplace=True)
show_data.head()

Unnamed: 0,Round,Category,Value,Question,Answer,air_date_group
0,Jeopardy!,HISTORY,200,"For the last 8 years of his life, Galileo was ...",Copernicus,post-2002
1,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,post-2002
2,Jeopardy!,EVERYBODY TALKS ABOUT IT...,200,The city of Yuma in this state has a record av...,Arizona,post-2002
3,Jeopardy!,THE COMPANY LINE,200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,post-2002
4,Jeopardy!,EPITAPHS & TRIBUTES,200,"Signer of the Dec. of Indep., framer of the Co...",John Adams,post-2002


In [9]:
show_data.drop('Answer', axis=1, inplace=True)
show_data.head()

Unnamed: 0,Round,Category,Value,Question,air_date_group
0,Jeopardy!,HISTORY,200,"For the last 8 years of his life, Galileo was ...",post-2002
1,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,200,No. 2: 1912 Olympian; football star at Carlisl...,post-2002
2,Jeopardy!,EVERYBODY TALKS ABOUT IT...,200,The city of Yuma in this state has a record av...,post-2002
3,Jeopardy!,THE COMPANY LINE,200,"In 1963, live on ""The Art Linkletter Show"", th...",post-2002
4,Jeopardy!,EPITAPHS & TRIBUTES,200,"Signer of the Dec. of Indep., framer of the Co...",post-2002


In [10]:
plt_df = show_data.groupby('Round').mean('Value')
plt_df.plot.bar();

In [11]:
show_data['Category'].nunique()

26951

In [12]:
show_data.drop('Category', axis=1, inplace=True)

In [13]:
def binning(value):
    if value < 1000:
        return np.round(value, -2)
    elif value < 10000:
        return np.round(value, -3)
    else:
        return np.round(value, -4)

show_data['Value'] = show_data['Value'].apply(binning)

In [14]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()
def text_preprocessing(text):
    return " ".join([lemmatizer.lemmatize(w) for w in tokenizer.tokenize(text)])
show_data['Question'] = show_data['Question'].apply(text_preprocessing)
show_data.head()

Unnamed: 0,Round,Value,Question,air_date_group
0,Jeopardy!,200,"For the last 8 year of his life, Galileo wa un...",post-2002
1,Jeopardy!,200,No. 2: 1912 Olympian; football star at Carlisl...,post-2002
2,Jeopardy!,200,The city of Yuma in this state ha a record ave...,post-2002
3,Jeopardy!,200,"In 1963, live on ""The Art Linkletter Show"", th...",post-2002
4,Jeopardy!,200,"Signer of the Dec. of Indep., framer of the Co...",post-2002


In [16]:
show_data.describe()

Unnamed: 0,Value
count,213296.0
mean,762.394513
std,664.001766
min,0.0
25%,400.0
50%,600.0
75%,1000.0
max,20000.0


### Model creation

In [17]:
y = show_data['Value']
X = show_data[['Round', 'air_date_group', 'Question']]
column_trans = ColumnTransformer([('Round', OneHotEncoder(dtype='int'),['Round']),
                                  ('air_date_group', OneHotEncoder(dtype='int'),['air_date_group']),
                                  ('Question', TfidfVectorizer(stop_words='english'), 'Question')],
                                remainder='drop')
X = column_trans.fit_transform(X)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [19]:
clf=RandomForestClassifier(n_jobs=-1)
clf.fit(X_train,y_train)

In [20]:
preds = clf.predict(X_test)

In [22]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
         100       0.23      0.25      0.24      2669
         200       0.22      0.29      0.25      9155
         300       0.20      0.17      0.18      2553
         400       0.21      0.20      0.20     12603
         500       0.22      0.19      0.20      2651
         600       0.19      0.15      0.17      6099
         700       0.00      0.00      0.00        57
         800       0.20      0.10      0.13      9743
         900       0.00      0.00      0.00        39
        1000       0.23      0.18      0.21     10163
        2000       0.39      0.78      0.52      7548
        3000       0.00      0.00      0.00       328
        4000       0.00      0.00      0.00       176
        5000       0.00      0.00      0.00       103
        6000       0.00      0.00      0.00        49
        7000       0.00      0.00      0.00        16
        8000       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
