# Project 3 - Web APIs and Natural Language Processing

## Running a logistic regression 

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression

#### 1. Splitting the data into train and test

In [2]:
# Reading the data into a dataframe 
he_cg = pd.read_csv('./clean_subreddit.csv')

In [3]:
# Checking the data
he_cg.head(1)

Unnamed: 0,author,created_utc,media_only,num_comments,score,selftext,subreddit,title,full_text
0,KAMI_aka,1580305052,False,0,1,Im in my final year of my undergraduate degree...,careerguidance,Can I pursue a master's in engineering managem...,Can I pursue a master's in engineering managem...


In [4]:
he_cg.shape

(19028, 9)

In [5]:
he_cg.columns

Index(['author', 'created_utc', 'media_only', 'num_comments', 'score',
       'selftext', 'subreddit', 'title', 'full_text'],
      dtype='object')

In [6]:
# Confirming all datatypes are as I need them 
he_cg.dtypes

author          object
created_utc      int64
media_only        bool
num_comments     int64
score            int64
selftext        object
subreddit       object
title           object
full_text       object
dtype: object

In [7]:
# Created_utc is an identifier and, therfore, should be an object 
he_cg['created_utc'] = he_cg['created_utc'].astype('object')

In [8]:
he_cg.dtypes

author          object
created_utc     object
media_only        bool
num_comments     int64
score            int64
selftext        object
subreddit       object
title           object
full_text       object
dtype: object

In [9]:
# Defining X and y 
X = he_cg['full_text']

In [10]:
y = he_cg['subreddit']

In [11]:
# Before splitting the data I want to see if classes are imbalanced. They are not
he_cg['subreddit'].value_counts()

careerguidance     9887
highereducation    9141
Name: subreddit, dtype: int64

In [12]:
# Splitting the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
# Mnaking sure the data is organizaed the way I need it 
X_train.head()

8918     Guidance, I need lots of it.AND0        Im in ...
10638    Advice Sought: Manipulative Faculty MemberAND0...
16069    The Least Important Question You Can Ask A Col...
5405     Can anyone help out an early teen ?AND0       ...
12036    Creating a Cost-Effective Financial Literacy L...
Name: full_text, dtype: object

In [14]:
X_test.head()

2141     Not pursuing my degree for two years? ( Human ...
18718    #sayyesAND0        Im in my final year of my u...
980      Career Path?AND0        Im in my final year of...
326      Leave my position for a similar role at a diff...
8398     Occupational health and safety, is it worth ta...
Name: full_text, dtype: object

In [15]:
y_train.head()

8918      careerguidance
10638    highereducation
16069    highereducation
5405      careerguidance
12036    highereducation
Name: subreddit, dtype: object

In [16]:
y_test.head()

2141      careerguidance
18718    highereducation
980       careerguidance
326       careerguidance
8398      careerguidance
Name: subreddit, dtype: object

In [17]:
# Using the CountVectorizer to vectorize each row. 
vectorizer = CountVectorizer()

In [18]:
# Fitting the vectorizer to my data 
vectorizer.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [19]:
# Transforming the train data 
X_train = vectorizer.transform(X_train)

In [20]:
# Turning the data into a dataframe
X_train_df = pd.DataFrame(X_train.toarray(), columns = vectorizer.get_feature_names())
X_train_df.shape

(13319, 12882)

In [21]:
# Transforming the test data 
X_test = vectorizer.transform(X_test)

In [22]:
# Turning test data into a dataframe 
X_test_df = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names())

X_test_df.shape

(5709, 12882)

In [26]:
# Instantiating the model 
lr = LogisticRegression()

In [27]:
# Fitting the model to the train data 
lr.fit(X_train_df, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
# Evaluating the model on the train data 
lr.score(X_train_df, y_train)

0.9770253021998648

In [29]:
# Evaluating the model on the test data 
lr.score(X_test_df, y_test)

0.9332632685233841