In [1]:
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
import re

In [2]:
df_tennis = pd.read_csv('./data/tennis.csv')
df_pickleball = pd.read_csv('./data/pickleball.csv')

In [3]:
df_tennis.columns

Index(['Unnamed: 0', 'subreddit', 'title', 'selftext', 'created_utc'], dtype='object')

In [4]:
df_pickleball.columns

Index(['Unnamed: 0', 'subreddit', 'title', 'selftext', 'created_utc'], dtype='object')

In [5]:
df_tennis.drop(columns='Unnamed: 0', inplace=True)
df_pickleball.drop(columns='Unnamed: 0', inplace=True)

In [6]:
df_tennis.head(5)

Unnamed: 0,subreddit,title,selftext,created_utc
0,tennis,Anyone down for a hit in Los Angeles?,"Hey, people! I'm a dude in his late twenties g...",1637458418
1,tennis,Zverev on Peng Shuai?,Any words from Zverev about Peng Shuai? Not su...,1637457638
2,tennis,Zverev beats Djokovic to set up Medvedev final...,,1637457620
3,tennis,Peng Shuai: China blocks CNN's signal to preve...,,1637456757
4,tennis,Peng Shuai situation explained: Chinese state ...,,1637454383


In [7]:
df_pickleball.head(5)

Unnamed: 0,subreddit,title,selftext,created_utc
0,Pickleball,Crazy Rules Question,[removed],1637457988
1,Pickleball,43 Court-Tested Pickleball Tips To Win Points ...,,1637445886
2,Pickleball,Phoenix Pickleball scene,[removed],1637437159
3,Pickleball,Phoenix Pickleball,[removed],1637418798
4,Pickleball,What's One Thing You Find Most Lacking In The ...,"Could be something like high quality reviews, ...",1637354289


In [8]:
df_pickleball['subreddit'].replace({'Pickleball': 'pickleball'})

0       pickleball
1       pickleball
2       pickleball
3       pickleball
4       pickleball
           ...    
1994    pickleball
1995    pickleball
1996    pickleball
1997    pickleball
1998    pickleball
Name: subreddit, Length: 1999, dtype: object

In [9]:
df_pickleball['subreddit'].replace({'Pickleball': 'pickleball'}, inplace=True)

In [10]:
len(df_tennis), len(df_pickleball)

(5000, 1999)

In [11]:
# remove duplicates
df_tennis.drop_duplicates(subset='title', inplace=True)
df_pickleball.drop_duplicates(subset='title', inplace=True)

len(df_tennis), len(df_pickleball)

(4913, 1955)

In [12]:
df_tennis.head()

Unnamed: 0,subreddit,title,selftext,created_utc
0,tennis,Anyone down for a hit in Los Angeles?,"Hey, people! I'm a dude in his late twenties g...",1637458418
1,tennis,Zverev on Peng Shuai?,Any words from Zverev about Peng Shuai? Not su...,1637457638
2,tennis,Zverev beats Djokovic to set up Medvedev final...,,1637457620
3,tennis,Peng Shuai: China blocks CNN's signal to preve...,,1637456757
4,tennis,Peng Shuai situation explained: Chinese state ...,,1637454383


In [13]:
df_pickleball.head()

Unnamed: 0,subreddit,title,selftext,created_utc
0,pickleball,Crazy Rules Question,[removed],1637457988
1,pickleball,43 Court-Tested Pickleball Tips To Win Points ...,,1637445886
2,pickleball,Phoenix Pickleball scene,[removed],1637437159
3,pickleball,Phoenix Pickleball,[removed],1637418798
4,pickleball,What's One Thing You Find Most Lacking In The ...,"Could be something like high quality reviews, ...",1637354289


In [14]:
# Check for nulls
df_tennis.isnull().sum()

subreddit         0
title             0
selftext       2779
created_utc       0
dtype: int64

In [15]:
df_pickleball.isnull().sum()

subreddit        0
title            0
selftext       774
created_utc      0
dtype: int64

In [16]:
# [removed] and [deleted] messages
df_tennis[df_tennis['selftext'] == '[removed]'].shape, df_tennis[df_tennis['selftext'] == '[deleted]'].shape
df_pickleball[df_pickleball['selftext'] == '[removed]'].shape, df_pickleball[df_pickleball['selftext'] == '[deleted]'].shape

((82, 4), (18, 4))

In [17]:
# Since there's selftext with [removed] and [deleted], we'll make null [none]
df_tennis['selftext'].fillna('[None]', inplace=True)
df_pickleball['selftext'].fillna('[None]', inplace=True)
df_tennis[df_tennis['selftext'] == '[None]'].shape, df_pickleball[df_pickleball['selftext'] == '[None]'].shape

((2779, 4), (774, 4))

In [18]:
lemmatizer = WordNetLemmatizer()

def clean(text):
    words = []
    
    text = text.lower()
    text = re.sub(r'https?:\/\/.*\/\w*', '', text) # Remove hyperlinks 
    text = re.sub(r'\&\w*;', '', text) # Remove html special entities     
    text = re.sub(r'\s\s+', ' ', text) # Remove whitespace       

    word = text.split()
    for i in word: 
        w = lemmatizer.lemmatize(i)
        words.append(w)      

    sent = ' '.join(words)
    
    return sent 

In [19]:
df_tennis['title'] = df_tennis['title'].apply(clean)
df_pickleball['title'] = df_pickleball['title'].apply(clean)

In [20]:
df_tennis['selftext'] = df_tennis['selftext'].apply(clean)
df_pickleball['selftext'] = df_pickleball['selftext'].apply(clean)

In [21]:
# save to file
df_tennis.to_csv('./data/tennis_cleaned.csv')
df_pickleball.to_csv('./data/pickleball_cleaned.csv')