In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import re
import sys
import json
from IPython.display import Image, display
# Work from the project root (one level up), but only when we are not already
# there, so re-running this cell does not keep climbing the directory tree.
if not os.path.exists('config/data-params.json'):
    os.chdir('..')
import pickle
import numpy as np

In [None]:
# Parse the data configuration; its keys supply the input/output paths and
# tweet ids used by the cells below.
with open('config/data-params.json') as config_file:
    data_cfg = json.load(config_file)

# Training Data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

In [None]:
# Name of the column in the training data used as the prediction target below.
dim = 'moderacy'

In [None]:
# Load the training table and drop the 'Unnamed: 0' column
# (presumably an index saved with the CSV -- confirm against the writer).
training_csv_path = os.path.join(data_cfg['output_data_path'], 'data.csv')
df = pd.read_csv(training_csv_path)
df = df.drop(columns=['Unnamed: 0'])
def remove_hashtags_and_ats(x):
    """Return ``x`` with every '#' and '@' character removed.

    Only the symbols themselves are stripped; the word that follows a
    hashtag or mention is kept.
    """
    cleaned = x
    for symbol in ('#', '@'):
        cleaned = cleaned.replace(symbol, '')
    return cleaned
# Strip '#' and '@' from every training text, then preview the first rows.
df['text'] = df['text'].map(remove_hashtags_and_ats)
df.head(5)

In [None]:
# Number of rows in the training data.
len(df)

In [None]:
# How many rows carry each distinct value of the target column.
df[dim].value_counts()

# Classifier

In [None]:
# Note: for political_right, use min_df = 0.00001

In [None]:
# Building blocks of the text classifier.
count_vect = CountVectorizer(
    stop_words='english',  # drop common English words
    max_df=0.3,            # ignore terms present in more than 30% of documents
    min_df=0.00001,        # float => minimum fraction of documents a term must appear in
)
tfidf_transformer = TfidfTransformer()
# fit_prior=False makes naive Bayes use a uniform class prior instead of
# learning class frequencies from the training labels.
clf = MultinomialNB(fit_prior=False)

In [None]:
# Chain the steps: raw text -> token counts -> TF-IDF weights -> classifier.
text_clf = Pipeline(steps=[
    ('vect', count_vect),
    ('tfidf', tfidf_transformer),
    ('clf', clf),
])

Cross-validation score

In [None]:
# 3-fold cross-validation of the classifier pipeline on the full training data.
scores = cross_val_score(
    text_clf,
    df['text'],
    df[dim],
    cv=3,
)
scores

Check errors

In [None]:
# Hold out 10% of the rows for error checking. A fixed random_state makes the
# split (and the error computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df[dim], test_size=0.1, random_state=42
)

In [None]:
# Fit the classifier pipeline on the training split only.
text_clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split.
y_pred = text_clf.predict(X_test)

In [None]:
# Mean absolute difference between predicted and true labels on the held-out split.
np.abs(y_pred - y_test).mean()

In [None]:
# Refit the classifier pipeline on every row (training and held-out alike);
# this is the fitted model that later predict calls use.
text_clf.fit(df['text'], df[dim])

# Regressor

In [None]:
# Building blocks of the text regressor. Note that `count_vect` and
# `tfidf_transformer` are rebound here to fresh instances.
count_vect = CountVectorizer(
    stop_words='english',  # drop common English words
    max_df=0.3,            # ignore terms present in more than 30% of documents
    min_df=0.01,           # keep only terms present in at least 1% of documents
)
tfidf_transformer = TfidfTransformer()
reg = Ridge(alpha=1.0)

In [None]:
# Chain the steps: raw text -> token counts -> TF-IDF weights -> ridge regression.
text_reg = Pipeline(steps=[
    ('vect', count_vect),
    ('tfidf', tfidf_transformer),
    ('reg', reg),
])

In [None]:
# 5-fold cross-validation of the regression pipeline on the full training data.
cross_val_score(
    text_reg,
    df['text'],
    df[dim],
    cv=5,
)

# User Test Data

In [None]:
# Reading in user tweets.
# Builds `tweets`: tweet id -> unpickled tweet record, where each entry of the
# record's 'user_ids' mapping is replaced by that user's tweets DataFrame.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# this project produced itself.
tweets = {}
user_data_dir = data_cfg['output_user_data_path']
for tweet_id in data_cfg['tweet_ids']:
    # Despite the .csv extension, the per-tweet file is read as a pickle.
    path = os.path.join(user_data_dir, 'tweet_{}.csv'.format(tweet_id))
    with open(path, 'rb') as tweet_file:  # close the handle once loaded
        tweet = pickle.load(tweet_file)
    # Read each user's CSV exactly once, for the tweet that was just loaded,
    # rather than re-reading the users of every previously loaded tweet.
    for user_id in list(tweet['user_ids'].keys()):
        user_csv_path = os.path.join(user_data_dir, 'user_{}_tweets.csv'.format(user_id))
        tweet['user_ids'][user_id] = pd.read_csv(user_csv_path)
    tweets[tweet_id] = tweet

In [None]:
# # Going through all the users of all the tweets
# for tweet_id in tweets.keys():
#     user_ids = list(tweets[tweet_id]['user_ids'].keys())
#     for user_id in user_ids:
#         df = pd.read_csv(os.path.join(data_cfg['output_user_data_path'], 'user_{}_tweets.csv'.format(user_id)))
#         def remove_hashtags_and_ats(x):
#             return x.replace('#', '').replace('@', '')
#         df['text'] = df['text'].apply(remove_hashtags_and_ats)
#         if df.shape[0] != 0:
#             print('user_id: {}'.format(user_id))
#             print(text_clf.predict_proba(df['text']))

In [None]:
# List the user ids collected for each configured tweet, with two blank lines
# between tweets.
for tweet_id in data_cfg['tweet_ids']:
    user_ids = list(tweets[tweet_id]['user_ids'].keys())
    print('Tweet ID: {}'.format(tweet_id), user_ids, sep='\n', end='\n\n\n')

In [None]:
user_id = '1061497789'
# Load this user's tweets into their own frame: reusing the name `df` here
# would overwrite the training data that the model-fitting cells read from.
user_df = pd.read_csv(os.path.join(data_cfg['output_user_data_path'], 'user_{}_tweets.csv'.format(user_id)))
# Same '#'/'@' clean-up as applied to the training text; the helper is the one
# defined alongside the training data, so it is not redefined here.
user_df['text'] = user_df['text'].apply(remove_hashtags_and_ats)
display(user_df['text'])
# Only score users that have at least one tweet.
if user_df.shape[0] != 0:
    print('user_id: {}'.format(user_id))
    print(text_clf.predict(user_df['text']))