# Project 3 - Web APIs and Natural Language Processing

## Running a Naive Bayes Classifier

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

#### 1. Splitting the data into train and test

In [32]:
# Reading the cleaned subreddit data (a CSV in the notebook's working directory) into a dataframe
he_cg = pd.read_csv('./clean_subreddit.csv')

In [33]:
# Defining X: the text column that will be vectorized into features
X = he_cg['full_text']

In [34]:
# Defining y: the subreddit label of each row, i.e. the class to predict
y = he_cg['subreddit']

In [35]:
# Before splitting the data, check whether the classes are imbalanced.
# They are not: the two subreddits have a similar number of rows.
he_cg['subreddit'].value_counts()

careerguidance     9887
highereducation    9141
Name: subreddit, dtype: int64

In [36]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [37]:
# Peek at the first training documents (still raw text at this point)
X_train.head()

8918     Guidance, I need lots of it.AND0        Im in ...
10638    Advice Sought: Manipulative Faculty MemberAND0...
16069    The Least Important Question You Can Ask A Col...
5405     Can anyone help out an early teen ?AND0       ...
12036    Creating a Cost-Effective Financial Literacy L...
Name: full_text, dtype: object

In [38]:
# Peek at the first test documents (still raw text at this point)
X_test.head()

2141     Not pursuing my degree for two years? ( Human ...
18718    #sayyesAND0        Im in my final year of my u...
980      Career Path?AND0        Im in my final year of...
326      Leave my position for a similar role at a diff...
8398     Occupational health and safety, is it worth ta...
Name: full_text, dtype: object

In [39]:
# Peek at the first training labels; the index values match the X_train rows from the same split
y_train.head()

8918      careerguidance
10638    highereducation
16069    highereducation
5405      careerguidance
12036    highereducation
Name: subreddit, dtype: object

In [40]:
# Peek at the first test labels; the index values match the X_test rows from the same split
y_test.head()

2141      careerguidance
18718    highereducation
980       careerguidance
326       careerguidance
8398      careerguidance
Name: subreddit, dtype: object

In [41]:
# Using the CountVectorizer (default settings) to turn each row of text into token counts
vectorizer = CountVectorizer()

In [42]:
# Fitting the vectorizer on the training text only, so the vocabulary is
# learned without looking at the test set
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [43]:
# Transforming the training text into a document-term count matrix.
# NOTE(review): this rebinds X_train from the raw text to the vectorized matrix,
# so re-running this cell on its own would feed the already-transformed matrix
# back into transform(). Re-run the train/test split cell first.
X_train = vectorizer.transform(X_train)

In [44]:
# Turning the data into a dataframe with one column per vocabulary token.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(), so use whichever this install provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE: .toarray() materialises the full dense count matrix (rows x vocabulary size).
X_train_df = pd.DataFrame(X_train.toarray(), columns=feature_names)
X_train_df.shape

(13319, 12882)

In [45]:
# Transforming the test text with the vocabulary learned from the training data
# (the vectorizer is not re-fitted here).
# NOTE(review): this rebinds X_test from the raw text to the vectorized matrix,
# so re-running this cell on its own would feed the already-transformed matrix
# back into transform(). Re-run the train/test split cell first.
X_test = vectorizer.transform(X_test)

In [46]:
# Turning the vectorized test data into a dataframe with one column per vocabulary token.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(), so use whichever this install provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE: .toarray() materialises the full dense count matrix (rows x vocabulary size).
X_test_df = pd.DataFrame(X_test.toarray(), columns=feature_names)

X_test_df.shape

(5709, 12882)

In [47]:
# Thanks to Michael Daugherty who very generously shared his code with me and told me what I needed
# to import to make it happen
# Instantiating a multinomial Naive Bayes classifier with its default settings
mult_bayes = MultinomialNB()

In [48]:
# Fitting the model to the train data
mult_bayes.fit(X_train_df, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [49]:
# Evaluating the score on the train data (for a classifier, score() is the mean accuracy)
mult_bayes.score(X_train_df, y_train)

0.9544260079585555

In [50]:
# Evaluating the score on the held-out test data (for a classifier, score() is the mean accuracy)
mult_bayes.score(X_test_df, y_test)

0.9260816255035909