In [1]:
import re
import unicodedata
import pandas as pd
import nltk
import prepare
import acquire
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from env import user, password, host

def get_db_url(database, host=host, user=user, password=password):
    """Build a SQLAlchemy connection URL for a MySQL database via the pymysql driver.

    Parameters
    ----------
    database : str
        Name of the database (schema) to connect to.
    host, user, password : str
        Connection details; they default to the values imported from ``env``.
        Pass the raw (not already percent-encoded) credentials.

    Returns
    -------
    str
        ``mysql+pymysql://<user>:<password>@<host>/<database>`` with the user
        name and password percent-encoded.
    """
    from urllib.parse import quote

    # Characters such as '@', ':' or '/' inside the credentials would otherwise be
    # read as URL delimiters, so they are percent-encoded before interpolation.
    # Purely alphanumeric credentials come out unchanged.
    safe_user = quote(str(user), safe='')
    safe_password = quote(str(password), safe='')
    return f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{database}'
from wordcloud import WordCloud

# Connect to the spam_db database and pull every row of the spam table.
url = get_db_url("spam_db")
sql = "SELECT * FROM spam"

# The table's id column becomes the DataFrame index.
df = pd.read_sql(sql, url, index_col="id")

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/samkeeler/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/samkeeler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Viewing the spread of spam vs. non-spam: number of messages per label.
# The classes are imbalanced (far more ham than spam).

df.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [3]:
# Run the prepare.py helper over the raw 'text' column, then keep only the lemmatized
# result: the original, clean and stemmed columns are dropped and the 'lemmatized'
# column takes over the 'text' name.

df = (
    prepare.make_prepped_columns(df, 'text')
    .drop(columns=['text', 'clean', 'stemmed'])
    .rename(columns={'lemmatized': 'text'})
)

In [4]:
# Inspect the prepared frame: the label plus the lemmatized text.
df

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don ' t think he go to usf he life aroun...
...,...,...
5567,spam,this is the 2nd time we have tried 2 contact u...
5568,ham,will i b going to esplanade fr home
5569,ham,pity wa in mood for that soany other suggestion
5570,ham,the guy did some bitching but i acted like i '...


In [5]:
# Creating a function to split the data into train, validate, test

def split(df, stratify_by=None, test_size=.2, validate_size=.3, random_state=123):
    """Split a DataFrame into train, validate and test subsets.

    The test set is carved off first, then the remainder is split into train
    and validate. With the defaults this gives 56% / 24% / 20% of the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    stratify_by : str or None, default None
        Column whose class proportions should be preserved in every subset.
        ``None`` performs a plain random split.
    test_size : float, default 0.2
        Share of ``df`` held out as the test set.
    validate_size : float, default 0.3
        Share of the non-test rows held out as the validate set.
    random_state : int, default 123
        Seed passed to both splits so the result is reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, validate, test)``
    """
    def _strata(frame):
        # The original indexed df[None] when no column was given, which raised
        # instead of falling back to an unstratified split.
        return None if stratify_by is None else frame[stratify_by]

    train_validate, test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=_strata(df),
    )

    train, validate = train_test_split(
        train_validate,
        test_size=validate_size,
        random_state=random_state,
        stratify=_strata(train_validate),
    )

    return train, validate, test

In [6]:
# Splitting the data into train / validate / test, stratified on the label column

train, validate, test = split(df, stratify_by='label')

In [7]:
# Specifying the data to model off of: the lemmatized message text of each split

X_train = train.text
X_validate = validate.text
X_test = test.text

In [8]:
# Specifying the labels that I am trying to predict (ham vs. spam) for each split

y_train = train.label
y_validate = validate.label
y_test = test.label

In [9]:
# Creating a TF-IDF vectorizer object with default settings

tfidf = TfidfVectorizer()

# Fitting that object onto the train data only, so the vocabulary and IDF weights
# are learned without looking at validate or test

tfidf.fit(X_train)

# Applying the fitted transformer to each data set

X_train_vectorized = tfidf.transform(X_train)
X_validate_vectorized = tfidf.transform(X_validate)
X_test_vectorized = tfidf.transform(X_test)

In [None]:
# Peek at the training text that was fed to the vectorizer
X_train

In [None]:
# Creating the modeling object (logistic regression with default hyperparameters)
# and fitting it on the vectorized train text and its labels

lm = LogisticRegression()
lm.fit(X_train_vectorized, y_train)

In [None]:
# Creating a dataframe per split that will hold predicted and actual values for evaluation metrics
# NOTE(review): this rebinds train/validate/test, which held the split frames until now;
# re-running the earlier X_* / y_* cells after this one would fail because the
# 'text' and 'label' columns are no longer present.

train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

In [None]:
# Attach the fitted model's predictions for each split next to the actual labels

train = train.assign(predicted=lm.predict(X_train_vectorized))
validate = validate.assign(predicted=lm.predict(X_validate_vectorized))
test = test.assign(predicted=lm.predict(X_test_vectorized))

In [None]:
# Train accuracy: share of train rows where the prediction matches the actual label

(train.actual == train.predicted).mean()

In [None]:
# Validate accuracy: share of validate rows where the prediction matches the actual label

(validate.actual == validate.predicted).mean()

In [None]:
# Test accuracy: share of test rows where the prediction matches the actual label

(test.actual == test.predicted).mean()

## Breaking Bad

In [None]:
def acquire_script():
    """Load the Breaking Bad script CSV from the current working directory.

    Returns
    -------
    pandas.DataFrame
        Contents of ``breaking_bad_dataset.csv`` without the ``Unnamed: 0`` column.
    """
    raw = pd.read_csv('breaking_bad_dataset.csv')
    # 'Unnamed: 0' is removed before returning (presumably a saved index column).
    return raw.drop(columns=['Unnamed: 0'])

In [None]:
# Load the Breaking Bad script and preview the first rows
script = acquire_script()
script.head()

In [None]:
def prep_script(script):
    """Clean the script frame and reduce its dialogue to a lemmatized 'text' column.

    Steps:
      1. Remove every bracketed span (``[...]``, matched non-greedily) from the
         string values of the frame.
      2. Cast the 'Text' column to ``str`` (missing values become the string 'nan').
      3. Run ``prepare.make_prepped_columns`` on 'Text'.
      4. Drop 'Text', 'clean' and 'stemmed', and rename 'lemmatized' to 'text'.

    Parameters
    ----------
    script : pandas.DataFrame
        Frame with at least a 'Text' column.

    Returns
    -------
    pandas.DataFrame
        The prepared frame with the remaining columns plus 'text'.
    """
    without_brackets = script.replace(r'\[.*?\]', '', regex=True)
    without_brackets['Text'] = without_brackets['Text'].apply(str)

    prepped = prepare.make_prepped_columns(without_brackets, 'Text')

    return (
        prepped
        .drop(columns=['Text', 'clean', 'stemmed'])
        .rename(columns={'lemmatized': 'text'})
    )

In [None]:
# Prepare the script text and preview the result
# NOTE(review): this rebinds script to the prepared frame, which no longer has a
# 'Text' column, so running this cell a second time without reloading would fail.
script = prep_script(script)
script.head()

In [None]:
# Number of lines per speaker (the classes to be predicted)
script.Speaker.value_counts()

In [None]:
# Split the script into train / validate / test, stratified on Speaker
train, validate, test = split(script, stratify_by = 'Speaker')

In [None]:
# Features: the lemmatized dialogue text of each split
X_train = train.text
X_validate = validate.text
X_test = test.text

In [None]:
# Targets: the speaker of each line
y_train = train.Speaker
y_validate = validate.Speaker
y_test = test.Speaker

In [None]:
# Creating a new TF-IDF vectorizer object for the script data
# NOTE(review): this and the following cells repeat the spam pipeline step for step and
# rebind the same names (tfidf, X_*_vectorized, lm, ...); a shared helper would avoid the copy.

tfidf = TfidfVectorizer()

# Fitting that object onto the train data only

tfidf.fit(X_train)

# Applying the fitted transformer to each data set

X_train_vectorized = tfidf.transform(X_train)
X_validate_vectorized = tfidf.transform(X_validate)
X_test_vectorized = tfidf.transform(X_test)

In [None]:
# Creating the modeling object (logistic regression with default hyperparameters)
# and fitting it to predict the speaker from the vectorized train text

lm = LogisticRegression()
lm.fit(X_train_vectorized, y_train)

In [None]:
# Creating a dataframe per split that will hold predicted and actual values for evaluation metrics
# NOTE(review): rebinds train/validate/test, replacing the split script frames.

train = pd.DataFrame(dict(actual=y_train))
validate = pd.DataFrame(dict(actual=y_validate))
test = pd.DataFrame(dict(actual=y_test))

In [None]:
# Attach the fitted model's speaker predictions for each split next to the actual speakers

train = train.assign(predicted=lm.predict(X_train_vectorized))
validate = validate.assign(predicted=lm.predict(X_validate_vectorized))
test = test.assign(predicted=lm.predict(X_test_vectorized))

In [None]:
# Train accuracy: share of train lines whose predicted speaker matches the actual speaker

(train.actual == train.predicted).mean()

In [None]:
# Put the actual/predicted speakers side by side with the text of each train line
# (the two objects are aligned on their shared index)
pd.concat([train, X_train], axis = 1)

In [None]:
# Peek at the training dialogue text
X_train

In [None]:
# Number of train lines per actual speaker
train.actual.value_counts()

In [None]:
# Spot-check a single line of the script by its index label
# NOTE(review): 1217 is a hard-coded label — presumably a row noticed while reviewing
# the predictions; confirm why this row matters.
script.loc[1217]

In [None]:
# After reviewing, I found that this dataset is not accurate: the speaker labels aren't always correct :(

## Article predictor

In [None]:
# Fetch articles for each of the listed categories with the project's acquire helper
# (presumably pulled from inshorts, one batch per category — confirm in acquire.py)
articles = acquire.get_all_inshorts(["business", "sports", "technology", "entertainment", "science", "world"])