**In the world of games, there are some classics that are never going out of style.**

One of those is Mario Party, which has become a staple in gaming culture over the past 20 years.

Another game that's been around for decades is The Legend of Zelda: Ocarina of Time. Both of these games were made by Nintendo and have helped define the company's brand identity in recent years.

In [1]:
import requests
import time
import nltk
import pandas as pd
import regex as re
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
url = 'https://www.reddit.com/hot.json'

In [5]:
headers = {'User-agent':'Eye eye bot 00'}

In [6]:

res = requests.get(url,headers=headers)

In [7]:
res.status_code

200

In [10]:
the_json = res.json()

In [11]:
sorted(the_json.keys())

['data', 'kind']

In [13]:
sorted(the_json['data'].keys())

['after', 'before', 'children', 'dist', 'geo_filter', 'modhash']

In [14]:
len(the_json['data']['children'])

25

In [16]:
pd.DataFrame(the_json['data']['children'])

Unnamed: 0,kind,data
0,t3,"{'approved_at_utc': None, 'subreddit': 'AskRed..."
1,t3,"{'approved_at_utc': None, 'subreddit': 'meirl'..."
2,t3,"{'approved_at_utc': None, 'subreddit': 'DnD', ..."
3,t3,"{'approved_at_utc': None, 'subreddit': 'antiwo..."
4,t3,"{'approved_at_utc': None, 'subreddit': 'funny'..."
5,t3,"{'approved_at_utc': None, 'subreddit': 'WhiteP..."
6,t3,"{'approved_at_utc': None, 'subreddit': 'politi..."
7,t3,"{'approved_at_utc': None, 'subreddit': 'nba', ..."
8,t3,"{'approved_at_utc': None, 'subreddit': 'yousee..."
9,t3,"{'approved_at_utc': None, 'subreddit': 'atheis..."


In [17]:
the_json['data']['children'][0]['data']

{'approved_at_utc': None,
 'subreddit': 'AskReddit',
 'selftext': '',
 'author_fullname': 't2_8ub49van',
 'saved': False,
 'mod_reason_title': None,
 'gilded': 0,
 'clicked': False,
 'title': 'What is something ancient that only an Internet Veteran can remember?',
 'link_flair_richtext': [],
 'subreddit_name_prefixed': 'r/AskReddit',
 'hidden': False,
 'pwls': 6,
 'link_flair_css_class': None,
 'downs': 0,
 'thumbnail_height': None,
 'top_awarded_type': None,
 'hide_score': False,
 'name': 't3_xz2zt9',
 'quarantine': False,
 'link_flair_text_color': 'dark',
 'upvote_ratio': 0.94,
 'author_flair_background_color': None,
 'subreddit_type': 'public',
 'ups': 3402,
 'total_awards_received': 2,
 'media_embed': {},
 'thumbnail_width': None,
 'author_flair_template_id': None,
 'is_original_content': False,
 'user_reports': [],
 'secure_media': None,
 'is_reddit_media_domain': False,
 'is_meta': False,
 'category': None,
 'secure_media_embed': {},
 'link_flair_text': None,
 'can_mod_post': Fal

In [18]:
len(the_json['data']['children'])

25

In [19]:
the_json['data']['after']

't3_xz3tn3'

In [21]:
[post['data']['name'] for post in the_json['data']['children']]

['t3_xz2zt9',
 't3_xz6iys',
 't3_xys8ez',
 't3_xz6wxs',
 't3_xz606h',
 't3_xz3j1m',
 't3_xz5kdd',
 't3_xz2lol',
 't3_xz5b7t',
 't3_xz4g6n',
 't3_xz6jss',
 't3_xz0zxm',
 't3_xyvzdl',
 't3_xz6yke',
 't3_xz4zte',
 't3_xyv3w9',
 't3_xz2clc',
 't3_xyx9q2',
 't3_xz7fpc',
 't3_xz26n3',
 't3_xz5llc',
 't3_xz224y',
 't3_xyrabm',
 't3_xz3ft0',
 't3_xz3tn3']

In [22]:
param = {'after':'t3_xz3tn3'}

In [23]:
requests.get(url,params =param, headers= headers )

<Response [200]>

In [28]:
# The function 'reddit_to_csv' will take three arguments: 
# 1. the subreddit being scraped; 2. the filename, or the name
# the csv file will be given; and 3. the number of requests 
# the user would like to make of reddit's API. 

def reddit_to_csv(subreddit, filename, n_requests=1):
    
    #Create an empty list to be used later in function:
    posts = []
    
    #Create User-Agent to avoid 429 res.status_code:
    headers = {'User-agent':'Eye eye bot 00'}
    
    #Establish that 'after' (a variable used later) is None type:
    after = None
    
    #for loop n_requests iterations (n_requests is established by user):
    for i in range(n_requests):
        print(i)
        
        if after == None:
            params = {}
        else:
            params = {'after': after}
        #Assign 'url' to reddit's base url, plus whatever subreddit 
        #the user provides,plus .json for clean results:
        #url = 'https://www.reddit.com/hot.json'
        url = 'https://www.reddit.com/' + str(subreddit) + '/.json'
        
        #Set my res variable equal to the results from requests.get, 
        #and the parameters set above like 'url' or 'params':
        res = requests.get(url,params=params,headers=headers)
        
        #Conditional statement to ensure access to the API is approved:
        if res.status_code ==200:
            the_json = res.json()
            
            for x in range(len(the_json['data']['children'])):
                
                #Create temporary dictionary to add results of each post to:
                temp_dict = {}
                #After looking through the json results, I've selected the below information about the posts
                #as those that can potentially add value to my model's results.
                temp_dict['subreddit'] = the_json['data']['children'][x]['data']['subreddit']
                temp_dict['title'] = the_json['data']['children'][x]['data']['title']
                temp_dict['post_paragraph'] = the_json['data']['children'][x]['data']['selftext']
                temp_dict['clicked'] = the_json['data']['children'][x]['data']['clicked']
                temp_dict['ups'] = the_json['data']['children'][x]['data']['ups']
                temp_dict['downs'] = the_json['data']['children'][x]['data']['downs']
                temp_dict['likes'] = the_json['data']['children'][x]['data']['likes']
                temp_dict['category'] = the_json['data']['children'][x]['data']['category']
                temp_dict['number_of_comments'] = the_json['data']['children'][x]['data']['num_comments']
                temp_dict['score'] = the_json['data']['children'][x]['data']['score']
                temp_dict['author_flair_css_class'] = the_json['data']['children'][x]['data']['author_flair_css_class']
                temp_dict['subreddit_type'] = the_json['data']['children'][x]['data']['subreddit_type']
                
                #Add the temporary dictionary to 'posts',the list of each post's dictionary of information:
                posts.append(temp_dict)
                #posts.extend(the_json['data']['children'])
            after = the_json['data']['after']
            
        else:
            print(res.status_code)
            break
        time.sleep(1)
        
    #Turn the list of post dictionaries into a pandas DataFrame:
    posts_df = pd.DataFrame(posts)
    
    #Drop any duplicate rows that may have been pulled:
    posts_df.drop_duplicates(inplace = True)
    
    #Rearrange the columns into a more logical order:
    posts_df = posts_df[['subreddit', 'title', 'clicked', 'ups', 'downs', 'post_paragraph', 'likes', 'number_of_comments', 'category', 'score', 'author_flair_css_class', 'subreddit_type']]
    
    #Save the DataFrame as a .csv file:
    posts_df.to_csv(str(filename), index = False, sep = ",")


        

In [30]:
#Load and save the data as CSV
reddit_to_csv(subreddit = 'r/MARIOPARTY',
              n_requests = 150,
              filename = 'mario_party_reddit_posts.csv')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149


In [31]:
#Load and save the data as CSV
reddit_to_csv(subreddit = 'r/zelda',
              n_requests = 150,
              filename = 'zelda_reddit_posts.csv')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149


In [33]:
mario_party_df = pd.read_csv('./mario_party_reddit_posts.csv')
mario_party_df.head(2)

Unnamed: 0,subreddit,title,clicked,ups,downs,post_paragraph,likes,number_of_comments,category,score,author_flair_css_class,subreddit_type
0,MARIOPARTY,Looking for buddies to play Mario Party Supers...,False,290,0,,,80,,290,wario,public
1,MARIOPARTY,Mario Party Superstars Most Wanted DLC 🌟 Surve...,False,178,0,Thanks all for filling out our survey!\n\nWith...,,49,,178,wario,public


In [34]:
mario_party_df.shape

(2110, 12)

In [37]:
zelda_df = pd.read_csv('./zelda_reddit_posts.csv')
zelda_df.head(2)

Unnamed: 0,subreddit,title,clicked,ups,downs,post_paragraph,likes,number_of_comments,category,score,author_flair_css_class,subreddit_type
0,zelda,r/Zelda Meta Discussion - Rule 2: Mark Spoilers,False,5,0,"Hi r/Zelda,\n\nTo continue discussing the subr...",,4,,5,rito,public
1,zelda,Today is Self-Post Sunday. Only self-posts are...,False,2,0,Self-Post Sundays are our main discussion day....,,0,,2,hylian,public


In [39]:
zelda_df.shape

(2431, 12)

**Appened the Mario and Zelda files**

In [41]:
df = mario_party_df.append(zelda_df, ignore_index=True)

In [44]:
df.shape

(4541, 12)

**CSV CHECKPOINT (check the data again).**

In [45]:
df.head(5)

Unnamed: 0,subreddit,title,clicked,ups,downs,post_paragraph,likes,number_of_comments,category,score,author_flair_css_class,subreddit_type
0,MARIOPARTY,Looking for buddies to play Mario Party Supers...,False,290,0,,,80,,290,wario,public
1,MARIOPARTY,Mario Party Superstars Most Wanted DLC 🌟 Surve...,False,178,0,Thanks all for filling out our survey!\n\nWith...,,49,,178,wario,public
2,MARIOPARTY,let’s talk about our favourite mario party boa...,False,10,0,go ahead⬇️⬇️⬇️,,36,,10,noflair,public
3,MARIOPARTY,what is your favorite mario party board of all...,False,1,0,Keep the responses comin!,,0,,1,noflair,public
4,MARIOPARTY,Mario Party 8 BS,False,1,0,I am trying to unlock hammer bro or blooper. I...,,3,,1,noflair,public


In [46]:
df.shape

(4541, 12)

In [47]:
zelda_df.shape

(2431, 12)

In [49]:
mario_party_df.shape

(2110, 12)

In [50]:
2431+2110

4541

**Create a 'target' column (will equal 1 if the post's subreddit is Mario Party, and 0 if the post's subreddit is Smash Bros. Ultimate):**

In [51]:
df['target'] = np.where(df['subreddit'] == 'MARIOPARTY', 1, 0)

In [53]:
df.head(3)

Unnamed: 0,subreddit,title,clicked,ups,downs,post_paragraph,likes,number_of_comments,category,score,author_flair_css_class,subreddit_type,target
0,MARIOPARTY,Looking for buddies to play Mario Party Supers...,False,290,0,,,80,,290,wario,public,1
1,MARIOPARTY,Mario Party Superstars Most Wanted DLC 🌟 Surve...,False,178,0,Thanks all for filling out our survey!\n\nWith...,,49,,178,wario,public,1
2,MARIOPARTY,let’s talk about our favourite mario party boa...,False,10,0,go ahead⬇️⬇️⬇️,,36,,10,noflair,public,1


**Look for columns that don't have any values and can be dropped**

In [54]:
df['likes'].isnull().sum()

4541

In [55]:
df['category'].isnull().sum()

4541

**The column 'clicked' is not empty, but the column values are purely False, therefore I will drop 'clicked' as well. The same for columns 'downs' and 'subreddit_type' which are purely 0's and 'public', respectively.**

In [56]:
df['clicked'].value_counts()

False    4541
Name: clicked, dtype: int64

In [57]:
df['downs'].value_counts()

0    4541
Name: downs, dtype: int64

In [58]:
df['subreddit_type'].value_counts()

public    4541
Name: subreddit_type, dtype: int64

In [59]:
df_drop_list = ['likes', 'category', 'clicked', 'downs', 'subreddit_type']

In [60]:
df.drop(df_drop_list, axis=1, inplace=True)

In [61]:
df.shape

(4541, 8)

In [62]:
df.to_csv('master_df.csv', index=False, sep=",")

**Set Tokenizer**

In [63]:
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

In [64]:
lemmatizer = WordNetLemmatizer()

In [65]:
df = pd.read_csv('./master_df.csv')

In [66]:
df.columns

Index(['subreddit', 'title', 'ups', 'post_paragraph', 'number_of_comments',
       'score', 'author_flair_css_class', 'target'],
      dtype='object')

**Create function that takes a column containing text and returns the lemmatized version in a new 'cleaned' column**

In [67]:
def column_cleaner(column, df=df):
    #For some reason, I was running into errors trying to run this code until I added the code
    #below (df[column+'_clean'] = ""), establishing from the beginning that the new column to be created
    #exists in the dataframe and contains nothing but empty strings.
    
    df[column+'_clean'] = ""
    
    #for loop through each row in the column:
    for i in range(len(df[column])):
        
        #Tokenize, or separate, each word in column's string into its own string (prep for lemmatization):
        col_tok = []
        col_tok.extend(tokenizer.tokenize(df[column][i].lower()))
        col_token = []
        [col_token.append(s) for s in col_tok if s not in col_token]
        
        #Lemmatize the words (cut the word to its base/root, for improved model results):
        col_lem = []
        for x in range(len(col_token)):
            col_lem.append(lemmatizer.lemmatize(col_token[x]))
        
        #Remove characters and numbers (for improved model results, hopefully):
        letters_only_col = []
        for c in range(len(col_lem)):
            letters_only_col.append(re.sub("[^a-zA-Z]", "", col_lem[c]))
        
        #Remove stopwords (for improved model results):
        col_words = [w for w in letters_only_col if not w in stopwords.words('english')]
        
        #Remove 'cheat' words (words that are in the subreddit's name and also in the column)
        cheat_words = ['mario', 'party', 'marioparty', 'smash', 'bros', 'ultimate', 'smashbrosultimate', 'super']
        col_words = [w for w in letters_only_col if not w in cheat_words]
        
        #Ensure that there are no 'None' objects in title_words:
        col_words = list(filter(None, col_words))

        #Join the lemmatized words - stopwords back to one long string (prep for
        #vectorization, done outside/after this function):
        col_words = " ".join(col_words)

        #Fill new column with 'cleaned' string from column:
        df[column+'_clean'][i] = col_words

In [68]:
column_cleaner(column='title', df=df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column+'_clean'][i] = col_words


In [69]:
df['post_paragraph'].head()

0                                                  NaN
1    Thanks all for filling out our survey!\n\nWith...
2                                       go ahead⬇️⬇️⬇️
3                            Keep the responses comin!
4    I am trying to unlock hammer bro or blooper. I...
Name: post_paragraph, dtype: object

In [70]:
df['post_paragraph'] = df['post_paragraph'].replace(np.nan, "")

In [71]:
column_cleaner(column='post_paragraph', df=df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column+'_clean'][i] = col_words


**Save version of DataFrame**

In [72]:
df.to_csv('master_df_cleaned.csv', index=False, sep=",")

In [73]:
df.head(3)

Unnamed: 0,subreddit,title,ups,post_paragraph,number_of_comments,score,author_flair_css_class,target,title_clean,post_paragraph_clean
0,MARIOPARTY,Looking for buddies to play Mario Party Supers...,290,,80,290,wario,1,looking for buddy to play superstar on switch ...,
1,MARIOPARTY,Mario Party Superstars Most Wanted DLC 🌟 Surve...,178,Thanks all for filling out our survey!\n\nWith...,49,178,wario,1,superstar most wanted dlc survey result,thanks all for filling out our survey with res...
2,MARIOPARTY,let’s talk about our favourite mario party boa...,10,go ahead⬇️⬇️⬇️,36,10,noflair,1,let s talk about our favourite board because w...,go ahead


**Create CountVectorize Function**

In [74]:
def count_vec_column(column, func_df=df):
    #Instantiate CountVectorizer:
    vect = CountVectorizer()
    
    #Create temporary variable X_text that takes on the fit/transformed results of the column:
    X_text = vect.fit_transform(func_df[column])
    
    #Turn X_text into an array (prep to easily make a DataFrame):
    X_text = X_text.toarray()
    
    #Create a temporary DataFrame with each word/word-pair/word-group as the columns:
    temp_df = pd.DataFrame(X_text,
                           columns=vect.get_feature_names())
    
    #Add the original column name to the beginning of the new columns' names to differentiate from which column
    # the vectorized words came from (this may impact the strength of the model):
    for i in range(len(temp_df.columns)):
        #print(i)
        temp_df.rename(columns={temp_df.columns[i]: column + '_' + temp_df.columns[i]}, inplace=True)
    
    #Combine the two DataFrames:
    func_df = pd.concat([func_df, temp_df], axis=1, join_axes=[func_df.index])
    return func_df

**Create TF-IDF Function**

In [75]:
def tfidf_column(column, func_df=df):
    #Instantiate TfidfVectorizer:
    tfidf_vect = TfidfVectorizer()
    
    #Create temporary variable X_text that takes on the fit/transformed results of the column:
    X_text = tfidf_vect.fit_transform(func_df[column])
    
    #Turn X_text into an array (prep to easily make a DataFrame):
    X_text = X_text.toarray()
    
    #Create a temporary DataFrame with each word/word-pair/word-group as the columns:
    temp_df = pd.DataFrame(X_text,
                           columns=tfidf_vect.get_feature_names())
    
    #Add the original column name to the beginning of the new columns' names to differentiate from which column
    # the tf-idf vectorized words came from (this may impact the strength of the model):
    for i in range(len(temp_df.columns)):
        #print(i)
        temp_df.rename(columns={temp_df.columns[i]: column + '_' + temp_df.columns[i]}, inplace=True)
    
    #Combine the two DataFrames:
    func_df = pd.concat([func_df, temp_df], axis=1, join_axes=[func_df.index])
    return func_df