In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set()

## 1 - Open training data

In [2]:
train_data = "../data/reddit_train.csv"
train = pd.read_csv(train_data)
train.head()

Unnamed: 0,id,comments,subreddits
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey
1,1,Ah yes way could have been :( remember when he...,nba
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends
3,3,He wouldn't have been a bad signing if we woul...,soccer
4,4,Easy. You use the piss and dry technique. Let ...,funny


## 2 - Basic Feature Extraction
 

### Number of Words

In [3]:
train['word_count'] = train['comments'].apply(lambda x: len(str(x).split(" ")))
train.head()

Unnamed: 0,id,comments,subreddits,word_count
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,58
1,1,Ah yes way could have been :( remember when he...,nba,29
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,18
3,3,He wouldn't have been a bad signing if we woul...,soccer,24
4,4,Easy. You use the piss and dry technique. Let ...,funny,46


### 2.1 Number of Characters

In [4]:
train['char_count'] = train['comments'].str.len() ## this also includes spaces
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,58,357
1,1,Ah yes way could have been :( remember when he...,nba,29,145
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,18,145
3,3,He wouldn't have been a bad signing if we woul...,soccer,24,123
4,4,Easy. You use the piss and dry technique. Let ...,funny,46,212


### 2.2 Removing the stopwords

In [6]:
# do this if its the first time using nltk stopwords
import nltk
nltk.download('stopwords') 

[nltk_data] Downloading package stopwords to /home/naysan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
from nltk.corpus import stopwords

stop = stopwords.words('english')

train['stopwords'] = train['comments'].apply(lambda x: len([x for x in x.split() if x in stop]))
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,58,357,20
1,1,Ah yes way could have been :( remember when he...,nba,29,145,12
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,18,145,9
3,3,He wouldn't have been a bad signing if we woul...,soccer,24,123,12
4,4,Easy. You use the piss and dry technique. Let ...,funny,46,212,17


### 2.3 Does the comment contain a url ?

In [8]:
train['links'] = train['comments'].apply(lambda x: True if "http" in x else False)
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,58,357,20,False
1,1,Ah yes way could have been :( remember when he...,nba,29,145,12,False
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,18,145,9,True
3,3,He wouldn't have been a bad signing if we woul...,soccer,24,123,12,False
4,4,Easy. You use the piss and dry technique. Let ...,funny,46,212,17,False


### 2.4 Number of Numerics

In [9]:
train['numerics'] = train['comments'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,58,357,20,False,1
1,1,Ah yes way could have been :( remember when he...,nba,29,145,12,False,0
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,18,145,9,True,0
3,3,He wouldn't have been a bad signing if we woul...,soccer,24,123,12,False,0
4,4,Easy. You use the piss and dry technique. Let ...,funny,46,212,17,False,0


### 2.5 Number of Uppercase words

In [12]:
train['upper'] = train['comments'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics,upper
0,0,"Honestly, Buffalo is the correct answer. I rem...",hockey,58,357,20,False,1,2
1,1,Ah yes way could have been :( remember when he...,nba,29,145,12,False,0,1
2,2,https://youtu.be/6xxbBR8iSZ0?t=40m49s\n\nIf yo...,leagueoflegends,18,145,9,True,0,0
3,3,He wouldn't have been a bad signing if we woul...,soccer,24,123,12,False,0,1
4,4,Easy. You use the piss and dry technique. Let ...,funny,46,212,17,False,0,0


## 3 - Basic Pre-processing

In [14]:
# Keep a copy of the unprocessed training dataframe just in case
unprocessed_train = train.copy()

### 3.1 - Lowercase

In [17]:
train['comments'] = train['comments'].apply(lambda x: " ".join(x.lower() for x in x.split()))
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics,upper
0,0,"honestly, buffalo is the correct answer. i rem...",hockey,58,357,20,False,1,2
1,1,ah yes way could have been :( remember when he...,nba,29,145,12,False,0,1
2,2,https://youtu.be/6xxbbr8isz0?t=40m49s if you d...,leagueoflegends,18,145,9,True,0,0
3,3,he wouldn't have been a bad signing if we woul...,soccer,24,123,12,False,0,1
4,4,easy. you use the piss and dry technique. let ...,funny,46,212,17,False,0,0


### 3.2 Removing Punctuation

In [18]:
train['comments'] = train['comments'].str.replace('[^\w\s]','')
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics,upper
0,0,honestly buffalo is the correct answer i remem...,hockey,58,357,20,False,1,2
1,1,ah yes way could have been remember when he w...,nba,29,145,12,False,0,1
2,2,httpsyoutube6xxbbr8isz0t40m49s if you didnt fi...,leagueoflegends,18,145,9,True,0,0
3,3,he wouldnt have been a bad signing if we would...,soccer,24,123,12,False,0,1
4,4,easy you use the piss and dry technique let a ...,funny,46,212,17,False,0,0


### 3.2 Removal of Stop Words

In [19]:
train['comments'] = train['comments'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics,upper
0,0,honestly buffalo correct answer remember peopl...,hockey,58,357,20,False,1,2
1,1,ah yes way could remember drafted thought gonn...,nba,29,145,12,False,0,1
2,2,httpsyoutube6xxbbr8isz0t40m49s didnt find alre...,leagueoflegends,18,145,9,True,0,0
3,3,wouldnt bad signing wouldnt paid 18m euros rig...,soccer,24,123,12,False,0,1
4,4,easy use piss dry technique let drops let dry ...,funny,46,212,17,False,0,0


### 3.3 Spelling correction - WARNING: takes a very long time!

In [20]:
from textblob import TextBlob
train['comments'].apply(lambda x: str(TextBlob(x).correct()))
train.head()

Unnamed: 0,id,comments,subreddits,word_count,char_count,stopwords,links,numerics,upper
0,0,honestly buffalo correct answer remember peopl...,hockey,58,357,20,False,1,2
1,1,ah yes way could remember drafted thought gonn...,nba,29,145,12,False,0,1
2,2,httpsyoutube6xxbbr8isz0t40m49s didnt find alre...,leagueoflegends,18,145,9,True,0,0
3,3,wouldnt bad signing wouldnt paid 18m euros rig...,soccer,24,123,12,False,0,1
4,4,easy use piss dry technique let drops let dry ...,funny,46,212,17,False,0,0


In [10]:
from textblob import TextBlob
x = 'transfering'
TextBlob(x).correct()

TextBlob("transferring")