**In the world of games, there are some classics that are never going out of style.**

One of those is Mario Party, which has become a staple in gaming culture over the past 20 years.

Another game that's been around for decades is The Legend of Zelda: Ocarina of Time. Both of these games were made by Nintendo and have helped define the company's brand identity in recent years.

In [1]:
import requests
import time
import nltk
import pandas as pd
import regex as re
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
url = 'https://www.reddit.com/hot.json'

In [3]:
headers = {'User-agent':'Eye eye bot 00'}

In [4]:

res = requests.get(url,headers=headers)

In [5]:
res.status_code

200

In [6]:
the_json = res.json()

In [7]:
sorted(the_json.keys())

['data', 'kind']

In [8]:
sorted(the_json['data'].keys())

['after', 'before', 'children', 'dist', 'geo_filter', 'modhash']

In [9]:
len(the_json['data']['children'])

25

In [10]:
pd.DataFrame(the_json['data']['children'])

Unnamed: 0,kind,data
0,t3,"{'approved_at_utc': None, 'subreddit': 'formul..."
1,t3,"{'approved_at_utc': None, 'subreddit': 'gadget..."
2,t3,"{'approved_at_utc': None, 'subreddit': 'MadeMe..."
3,t3,"{'approved_at_utc': None, 'subreddit': 'intere..."
4,t3,"{'approved_at_utc': None, 'subreddit': 'news',..."
5,t3,"{'approved_at_utc': None, 'subreddit': 'AskRed..."
6,t3,"{'approved_at_utc': None, 'subreddit': 'Terrif..."
7,t3,"{'approved_at_utc': None, 'subreddit': 'oddlyt..."
8,t3,"{'approved_at_utc': None, 'subreddit': 'aww', ..."
9,t3,"{'approved_at_utc': None, 'subreddit': 'nba', ..."


In [11]:
the_json['data']['children'][0]['data']

{'approved_at_utc': None,
 'subreddit': 'formula1',
 'selftext': '',
 'author_fullname': 't2_b6e4ydm4',
 'saved': False,
 'mod_reason_title': None,
 'gilded': 1,
 'clicked': False,
 'title': 'Gasly close to tractor on track under red flag',
 'link_flair_richtext': [{'a': ':post-video:',
   'e': 'emoji',
   'u': 'https://emoji.redditmedia.com/ul37w8s0jjn71_t5_2qimj/post-video'},
  {'e': 'text', 't': ' Video /r/all'}],
 'subreddit_name_prefixed': 'r/formula1',
 'hidden': False,
 'pwls': 6,
 'link_flair_css_class': 'sub-all',
 'downs': 0,
 'thumbnail_height': 78,
 'top_awarded_type': None,
 'hide_score': False,
 'name': 't3_xzdf4u',
 'quarantine': False,
 'link_flair_text_color': 'dark',
 'upvote_ratio': 0.97,
 'author_flair_background_color': '#00a19c',
 'subreddit_type': 'public',
 'ups': 11577,
 'total_awards_received': 18,
 'media_embed': {},
 'thumbnail_width': 140,
 'author_flair_template_id': 'fd928abe-dd1a-11eb-b9ac-0e054c859a35',
 'is_original_content': False,
 'user_reports': []

In [12]:
len(the_json['data']['children'])

25

In [13]:
the_json['data']['after']

't3_xze4c0'

In [14]:
[post['data']['name'] for post in the_json['data']['children']]

['t3_xzdf4u',
 't3_xza0eu',
 't3_xzbqle',
 't3_xz9tr9',
 't3_xz8yuf',
 't3_xz2zt9',
 't3_xz3sza',
 't3_xz9kp4',
 't3_xz8qw6',
 't3_xz2lol',
 't3_xz0zxm',
 't3_xyx9q2',
 't3_xz7fpc',
 't3_xz8qof',
 't3_xz6wxs',
 't3_xz0d7c',
 't3_xz6iys',
 't3_xzdpug',
 't3_xz6yke',
 't3_xz6zgr',
 't3_xyskr6',
 't3_xz26ob',
 't3_xz5kdd',
 't3_xz8vv5',
 't3_xze4c0']

In [15]:
param = {'after':'t3_xz3tn3'}

In [16]:
requests.get(url,params =param, headers= headers )

<Response [200]>

In [17]:
def reddit_to_csv(subreddit, filename, n_requests=1):
    """Scrape a subreddit's listing pages and save selected post fields as CSV.

    Each request fetches one page of the listing; the 'after' token returned
    with a page is sent back as a query parameter to fetch the next one.

    Args:
        subreddit: Path appended to reddit's base URL, e.g. 'r/zelda'.
        filename: Name (path) of the CSV file to write.
        n_requests: Maximum number of requests to make to reddit's API.
            Paging stops early when reddit reports no further page or
            answers with a non-200 status code.

    Returns:
        None. The de-duplicated posts are written to `filename`.
    """
    # CSV column -> key in each post's JSON 'data' dictionary. The order here
    # is the column order of the saved file.
    field_map = {
        'subreddit': 'subreddit',
        'title': 'title',
        'clicked': 'clicked',
        'ups': 'ups',
        'downs': 'downs',
        'post_paragraph': 'selftext',
        'likes': 'likes',
        'number_of_comments': 'num_comments',
        'category': 'category',
        'score': 'score',
        'author_flair_css_class': 'author_flair_css_class',
        'subreddit_type': 'subreddit_type',
    }

    # Custom User-agent to avoid a 429 res.status_code:
    headers = {'User-agent': 'Eye eye bot 00'}

    # The URL does not change between pages, so build it once:
    url = 'https://www.reddit.com/' + str(subreddit) + '/.json'

    posts = []
    after = None

    for i in range(n_requests):
        print(i)

        params = {} if after is None else {'after': after}

        # A timeout keeps a stalled connection from hanging the scrape forever.
        res = requests.get(url, params=params, headers=headers, timeout=30)

        # Stop as soon as access to the API is not approved:
        if res.status_code != 200:
            print(res.status_code)
            break

        listing = res.json()['data']
        for child in listing['children']:
            post = child['data']
            # .get() leaves a missing field empty instead of aborting the run.
            posts.append({col: post.get(key) for col, key in field_map.items()})

        after = listing['after']
        if after is None:
            # No further page: requesting again without 'after' would only
            # re-download the first page.
            break

        time.sleep(1)

    # Passing `columns` fixes the column order and keeps the frame well-formed
    # even when no posts were collected.
    posts_df = pd.DataFrame(posts, columns=list(field_map))

    # Drop any duplicate rows that may have been pulled:
    posts_df.drop_duplicates(inplace=True)

    # Save the DataFrame as a .csv file:
    posts_df.to_csv(str(filename), index=False, sep=",")


        

In [None]:
#Load and save the data as CSV
reddit_to_csv(subreddit = 'r/MARIOPARTY',
              n_requests = 150,
              filename = 'mario_party_reddit_posts.csv')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52


In [None]:
#Load and save the data as CSV
reddit_to_csv(subreddit = 'r/zelda',
              n_requests = 150,
              filename = 'zelda_reddit_posts.csv')

In [None]:
mario_party_df = pd.read_csv('./mario_party_reddit_posts.csv')
mario_party_df.head(2)

In [None]:
mario_party_df.shape

In [None]:
zelda_df = pd.read_csv('./zelda_reddit_posts.csv')
zelda_df.head(2)

In [None]:
zelda_df.shape

**Append the Mario Party and Zelda DataFrames**

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# stacks the two frames and renumbers the rows 0..n-1 in the same way.
df = pd.concat([mario_party_df, zelda_df], ignore_index=True)

In [None]:
df.shape

**CSV CHECKPOINT (check the data again).**

In [None]:
df.head(5)

In [None]:
df.shape

In [None]:
zelda_df.shape

In [None]:
mario_party_df.shape

In [None]:
2431+2110

**Create a 'target' column (will equal 1 if the post's subreddit is Mario Party, and 0 if the post's subreddit is Zelda):**

In [None]:
df['target'] = np.where(df['subreddit'] == 'MARIOPARTY', 1, 0)

In [None]:
df.head(3)

**Look for columns that don't have any values and can be dropped**

In [None]:
df['likes'].isnull().sum()

In [None]:
df['category'].isnull().sum()

**The column 'clicked' is not empty, but the column values are purely False, therefore I will drop 'clicked' as well. The same for columns 'downs' and 'subreddit_type' which are purely 0's and 'public', respectively.**

In [None]:
df['clicked'].value_counts()

In [None]:
df['downs'].value_counts()

In [None]:
df['subreddit_type'].value_counts()

In [None]:
df_drop_list = ['likes', 'category', 'clicked', 'downs', 'subreddit_type']

In [None]:
df.drop(df_drop_list, axis=1, inplace=True)

In [None]:
df.shape

In [None]:
df.to_csv('master_df.csv', index=False, sep=",")

**Set Tokenizer**

In [None]:
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
df = pd.read_csv('./master_df.csv')

In [None]:
df.columns

**Create a function that takes the name of a text column and adds its lemmatized, cleaned version to the DataFrame as a new '<column>_clean' column**

In [None]:
def column_cleaner(column, df=df, cheat_words=None):
    """Add a cleaned, lemmatized copy of a text column to `df` in place.

    For every row the text is lower-cased, tokenized, de-duplicated (first
    occurrence kept), lemmatized and stripped of every non-letter character.
    Stop words, 'cheat' words and empty tokens are then dropped and the
    remaining words are joined with single spaces.

    Args:
        column: Name of the text column to clean.
        df: DataFrame to modify. The default is whatever module-level `df`
            existed when this function was defined, so pass the frame
            explicitly if `df` has been reassigned since.
        cheat_words: Optional iterable of words to remove because they give
            away the subreddit. Defaults to the subreddit-name words below.

    Returns:
        None. The result is stored in the new column `column + '_clean'`.
    """
    if cheat_words is None:
        # Words that appear in the subreddits' names; 'zelda' is included
        # because r/zelda is one of the two subreddits being classified.
        cheat_words = ['mario', 'party', 'marioparty', 'smash', 'bros',
                       'ultimate', 'smashbrosultimate', 'super', 'zelda']

    # Build the exclusion set once: reading the stop-word corpus per word is
    # very slow, and set membership is O(1).
    excluded = set(stopwords.words('english')) | set(cheat_words)
    non_letters = re.compile("[^a-zA-Z]")

    cleaned = []
    for text in df[column]:
        # Missing values are cleaned as empty text rather than raising.
        text = "" if pd.isna(text) else str(text)

        # dict.fromkeys drops repeated tokens while keeping first-seen order.
        tokens = dict.fromkeys(tokenizer.tokenize(text.lower()))

        # Lemmatize first, then strip characters and numbers.
        words = (non_letters.sub("", lemmatizer.lemmatize(tok)) for tok in tokens)

        # Drop empty strings, stop words and cheat words in a single pass.
        cleaned.append(" ".join(w for w in words if w and w not in excluded))

    # Assign the whole column at once; per-cell chained assignment
    # (df[col][i] = ...) is unreliable in pandas.
    df[column + '_clean'] = cleaned

In [None]:
column_cleaner(column='title', df=df)

In [None]:
df['post_paragraph'].head()

In [None]:
df['post_paragraph'] = df['post_paragraph'].replace(np.nan, "")

In [None]:
column_cleaner(column='post_paragraph', df=df)

**Save version of DataFrame**

In [None]:
df.to_csv('master_df_cleaned.csv', index=False, sep=",")

In [None]:
df.head(3)

**Create CountVectorize Function**

In [None]:
def count_vec_column(column, func_df=df):
    """Return `func_df` with CountVectorizer features of `column` appended.

    Args:
        column: Name of the text column to vectorize.
        func_df: DataFrame holding that column. The default is the
            module-level `df` that existed when this function was defined.

    Returns:
        A new DataFrame: the original columns followed by one count column
        per vocabulary term, each named `column + '_' + term` so it is clear
        which text column the term came from.
    """
    # Instantiate CountVectorizer and fit/transform the column:
    vect = CountVectorizer()
    counts = vect.fit_transform(func_df[column])

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on versions that predate get_feature_names_out().
    try:
        terms = vect.get_feature_names_out()
    except AttributeError:
        terms = vect.get_feature_names()

    # Build the feature frame with its final column names and with func_df's
    # index, so the concat below lines rows up one-to-one.
    temp_df = pd.DataFrame(counts.toarray(),
                           columns=[column + '_' + term for term in terms],
                           index=func_df.index)

    # Combine the two DataFrames (join_axes was removed in pandas 1.0):
    return pd.concat([func_df, temp_df], axis=1)

**Create TF-IDF Function**

In [None]:
def tfidf_column(column, func_df=df):
    """Return `func_df` with TF-IDF features of `column` appended.

    Args:
        column: Name of the text column to vectorize.
        func_df: DataFrame holding that column. The default is the
            module-level `df` that existed when this function was defined.

    Returns:
        A new DataFrame: the original columns followed by one TF-IDF column
        per vocabulary term, each named `column + '_' + term` so it is clear
        which text column the term came from.
    """
    # Instantiate TfidfVectorizer and fit/transform the column:
    tfidf_vect = TfidfVectorizer()
    weights = tfidf_vect.fit_transform(func_df[column])

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on versions that predate get_feature_names_out().
    try:
        terms = tfidf_vect.get_feature_names_out()
    except AttributeError:
        terms = tfidf_vect.get_feature_names()

    # Build the feature frame with its final column names and with func_df's
    # index, so the concat below lines rows up one-to-one.
    temp_df = pd.DataFrame(weights.toarray(),
                           columns=[column + '_' + term for term in terms],
                           index=func_df.index)

    # Combine the two DataFrames (join_axes was removed in pandas 1.0):
    return pd.concat([func_df, temp_df], axis=1)