### DSPT6 - Adding Data Science to a Web Application

The purpose of this notebook is to demonstrate:
- Simple online analysis of data from a user of the Twitoff app or an API
- Training a more complicated offline model and serializing the result for online use

In [46]:
import os
import pickle
import sqlite3

import numpy as np
import pandas as pd
import sklearn
import spacy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
# Connect to the SQLite database.
# The location can be overridden with the TWITOFF_DB environment variable so the
# notebook is not tied to one machine's home directory; the default keeps the
# original path.
DB_PATH = os.environ.get(
    'TWITOFF_DB',
    '/home/terrence/repos/lambda/twitter_web_app/bruno_demo/twitoff_demo.sqlite3',
)
conn = sqlite3.connect(DB_PATH)

In [5]:
def get_data(query, conn, params=()):
    '''Run a SQL statement on a SQLite connection and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    conn : sqlite3.Connection
        Open database connection.
    params : sequence or dict, optional
        Values bound to the placeholders in ``query``. Defaults to no
        parameters, which matches the original two-argument call.

    Returns
    -------
    pandas.DataFrame
        One row per fetched row, with columns named after the result columns.
        A statement that produces no result set yields an empty DataFrame.
    '''
    cursor = conn.cursor()
    try:
        rows = cursor.execute(query, params).fetchall()
        # description is None when the statement returns no result set
        # (e.g. DDL), so fall back to "no columns" instead of raising.
        columns = [col[0] for col in (cursor.description or ())]
    finally:
        # Release the cursor even if execution fails.
        cursor.close()

    return pd.DataFrame(data=rows, columns=columns)

In [6]:
# Join every tweet to its author so each row carries the username.
sql = '''
SELECT
tweet.tweet,
tweet.embedding,
tweet.user_id,
user.username
FROM tweet
JOIN user
ON tweet.user_id = user.Id;
'''

df = get_data(sql, conn)

# The embedding column holds pickled bytes; unpickle each value into its vector.
# NOTE: pickle.loads can execute arbitrary code, so only run this against a
# database whose contents you trust.
df['decoded_embedding'] = df.embedding.apply(pickle.loads)

In [23]:
# Preview the first three rows of the joined frame, including the decoded embeddings.
df.head(3)

Unnamed: 0,tweet,embedding,user_id,username,decoded_embedding
0,"Worth a read, I liked this Economist article a...",b'\x80\x04\x95\xee\x11\x00\x00\x00\x00\x00\x00...,50393960,billgates,"[-0.07537176, 0.27606362, -0.0547825, 0.014415..."
1,"Another interesting article, this one on a pro...",b'\x80\x04\x95\xee\x11\x00\x00\x00\x00\x00\x00...,50393960,billgates,"[-0.14114963, 0.17626844, -0.18227336, 0.10240..."
2,The Guardian has partnered with the foundation...,b'\x80\x04\x95\xee\x11\x00\x00\x00\x00\x00\x00...,50393960,billgates,"[-0.002723366, 0.10334747, -0.100752264, 0.025..."


In [21]:
# Number of tweets available per account.
df['username'].value_counts()

billgates      2889
barackobama    2766
jimmyfallon    2353
KingJames      2193
nasa           1693
elonmusk        397
Name: username, dtype: int64

In [31]:
# Select the decoded embedding vectors of the two accounts being compared.
user1_embeddings = df.loc[df['username'] == 'barackobama', 'decoded_embedding']
user2_embeddings = df.loc[df['username'] == 'jimmyfallon', 'decoded_embedding']

# user1's rows come first, followed by user2's; the labels built later rely on this order.
embeddings_to_test = pd.concat([user1_embeddings, user2_embeddings])

print(user1_embeddings.shape, embeddings_to_test.shape)

(2766,) (5119,)


In [35]:
# Expand each embedding vector into its own row, one column per component
# (dim0 ... dim299).
embeddings_df = pd.DataFrame(
    embeddings_to_test.tolist(),
    columns=[f'dim{i}' for i in range(300)],
)

# Label 1.0 for every user1 row, then 0.0 for every user2 row, in the same
# order the embeddings were concatenated.
labels = np.repeat([1.0, 0.0], [len(user1_embeddings), len(user2_embeddings)])

print(embeddings_df.shape, labels.shape)

(5119, 300) (5119,)


In [40]:

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings_df,
    labels,
    test_size=.25,
    random_state=42,
)
print(X_train.shape, X_test.shape)

(3839, 300) (1280, 300)


In [41]:
# Logistic regression over the embedding columns, limited to 1000 solver iterations.
model = LogisticRegression(max_iter=1000)

# Fit on the training split only; the test split stays unseen for evaluation.
model.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=1000)

In [44]:
# Predict on the held-out split and print the per-class metrics.
y_pred = model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.95      0.95       564
         1.0       0.96      0.96      0.96       716

    accuracy                           0.95      1280
   macro avg       0.95      0.95      0.95      1280
weighted avg       0.95      0.95      0.95      1280



In [49]:
# Load the 'en_core_web_md' spaCy pipeline with the tagger and parser components
# disabled; only the document vector is used below.
# NOTE(review): presumably the same model that produced the stored embeddings — confirm.
nlp = spacy.load('en_core_web_md', disable=['tagger', 'parser'])

def vec_tweet(nlp, tweet_text):
    '''Embed a piece of text and return its vector as a plain list.

    Parameters
    ----------
    nlp : callable
        Called with the text; its return value must expose a ``vector`` attribute.
    tweet_text : str
        Text to embed.

    Returns
    -------
    list
        The elements of the ``vector`` attribute, in order.
    '''
    doc = nlp(tweet_text)
    return [component for component in doc.vector]

In [50]:
# Embed a sample phrase to try the trained classifier on unseen text.
new_embedding = vec_tweet(nlp=nlp, tweet_text='my next guest')

In [52]:
# Classify the new text. Labels were built as 1 for 'barackobama' and 0 for
# 'jimmyfallon', so a prediction of 0.0 attributes the text to jimmyfallon.
model.predict([new_embedding])

array([0.])

In [59]:
# Serialize the fitted model for online use. The context manager ensures the
# file is flushed and closed even if pickling fails part-way.
with open("../models/model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)