In [1]:
import pandas as pd

In [2]:
#Create a dataframe from the CSV
df = pd.read_csv("south_park_dialogue.zip")
df.head()

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away. \n"
1,10,1,Kyle,Going away? For how long?\n
2,10,1,Stan,Forever.\n
3,10,1,Chef,I'm sorry boys.\n
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."


In [3]:
#remove \n from Line column
df['Line'] = df['Line'].replace({'\n':''}, regex=True)
df.head()

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away."
1,10,1,Kyle,Going away? For how long?
2,10,1,Stan,Forever.
3,10,1,Chef,I'm sorry boys.
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."


In [4]:
#Remove season and episode columns since these are not needed for our task
character_line_df = df.drop(columns=['Season','Episode'])
character_line_df.head()

Unnamed: 0,Character,Line
0,Stan,"You guys, you guys! Chef is going away."
1,Kyle,Going away? For how long?
2,Stan,Forever.
3,Chef,I'm sorry boys.
4,Stan,"Chef said he's been bored, so he joining a gro..."


In [5]:
#import stopwords
from nltk.corpus import stopwords

In [6]:
stop_words = set(stopwords.words('english'))

In [7]:
#using regex tokenizer to tokenize and remove special characters
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

In [8]:
character_line_df.head()

Unnamed: 0,Character,Line
0,Stan,"You guys, you guys! Chef is going away."
1,Kyle,Going away? For how long?
2,Stan,Forever.
3,Chef,I'm sorry boys.
4,Stan,"Chef said he's been bored, so he joining a gro..."


In [11]:
#lower case dataframe, tokenize, and apply stopwords
#lower_df= character_line_df.apply(lambda x: x.str.lower())
#lower_df['Line'] = lower_df['Line'].apply(tokenizer.tokenize)
#lower_df['Line'] = lower_df['Line'].apply(lambda x: [item for item in x if item not in stop_words])
#lower_df.columns=['character','line']

In [12]:
#lower_df.head()

In [13]:
#find characters with most lines
top_characters = character_line_df.groupby(['Character']).size().loc[character_line_df.groupby(['Character']).size() > 999]
top_characters
main_characters = ['Butters','Cartman','Kyle','Mr. Garrison', 'Randy', 'Stan']

In [14]:
character_line_df.head()

Unnamed: 0,Character,Line
0,Stan,"You guys, you guys! Chef is going away."
1,Kyle,Going away? For how long?
2,Stan,Forever.
3,Chef,I'm sorry boys.
4,Stan,"Chef said he's been bored, so he joining a gro..."


In [15]:
#filter dataframe
filtered_df = character_line_df.loc[character_line_df['Character'].isin(main_characters)]
#filtered_df.columns('character','line')
filtered_df['Line'] = filtered_df['Line'].str.replace('[^\w\s]','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [20]:
filtered_df.head()

Unnamed: 0,Character,Line
0,Stan,You guys you guys Chef is going away
1,Kyle,Going away For how long
2,Stan,Forever
4,Stan,Chef said hes been bored so he joining a group...
9,Cartman,Im gonna miss him Im gonna miss Chef and Iand...


In [17]:
from sklearn.model_selection import train_test_split

#filtered_df['Line'] = [line.replace('\n','') for line in filtered_df['Line']]
train, test = train_test_split(filtered_df, test_size=0.3, random_state=14)
test.head()

Unnamed: 0,Character,Line
21669,Cartman,No I just dont have the mojo you have Butters
59521,Cartman,Yeah but not if you guys have one too Now its ...
28722,Cartman,Scuzzlebutt is a creature that lives up on thi...
5640,Stan,Gee thanks a lot Dad
46081,Kyle,Bleuhleuhleuhleuhleuh


In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk
from nltk.stem.lancaster import LancasterStemmer

st = LancasterStemmer()
def token(text):
    txt = nltk.word_tokenize(text)
    return [st.stem(word) for word in txt]


stop = set(stopwords.words("english"))
cv = CountVectorizer(lowercase=True, 
                     tokenizer=token, stop_words=stop,
                     analyzer=u'word', min_df=4)
#print(train['Line'].tolist())

vec_train = cv.fit_transform(train['Line'].tolist())
vec_test = cv.transform(test['Line'].tolist())


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score

rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rf.fit(X = vec_train, y = train['Character'])

accuracy_score(rf.predict(vec_test), test['Character'])

0.4170657379190248