In [None]:
import pandas as pd
from textblob import TextBlob

In [None]:
# Load the first 1,000 rows of the cleaned texts. Only the two columns used
# below are read, and malformed CSV lines are skipped instead of aborting.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/M2_MLSD/PPD/cleaned_resultats.csv',
    usecols=['state', 'cleaned_texts'],
    nrows=1000,
    on_bad_lines='skip',
)
df.shape

(1000, 2)

In [None]:
# Discard every row that has a missing value in any loaded column.
df = df.dropna(axis=0, how='any')
df.shape

(962, 2)

In [None]:
# Keep only rows carrying a specific state code; the generic 'USA' label is dropped.
is_usa = df['state'] == 'USA'
df = df[~is_usa]
df.shape

(803, 2)

In [None]:
# Preview the first rows of the filtered frame (the index keeps its original labels).
df.head()

Unnamed: 0,state,cleaned_texts
0,NJ,Don’t drink ladder
1,NJ,We expect relapse hopefully contained smaller ...
2,TX,Want help great profit local small tshirt busi...
3,NY,Me torturing wife quarantine quarantine stay...
4,CA,Angela Merkel declares coronavirus biggest cha...


# Sentiment analysis

In [None]:
# Polarity scoring helper
def getPolarity(text):
  """Return the TextBlob sentiment polarity score computed for `text`."""
  blob = TextBlob(text)
  return blob.sentiment.polarity

def getAnalysis(score):
  """Turn a polarity score into a label.

  Returns 'Negative' when score < 0, 'Neutral' when score == 0,
  and 'Positive' for every other value.
  """
  if score < 0:
    label = 'Negative'
  else:
    label = 'Neutral' if score == 0 else 'Positive'
  return label

In [None]:
# Score every text once, then bucket each score into a sentiment label.
polarity_scores = df['cleaned_texts'].apply(getPolarity)
df['sentiment'] = polarity_scores
df['sentiment_analysis'] = polarity_scores.apply(getAnalysis)
df.head()

Unnamed: 0,state,cleaned_texts,sentiment,sentiment_analysis
0,NJ,Don’t drink ladder,0.0,Neutral
1,NJ,We expect relapse hopefully contained smaller ...,0.0,Neutral
2,TX,Want help great profit local small tshirt busi...,0.3375,Positive
3,NY,Me torturing wife quarantine quarantine stay...,-0.181818,Negative
4,CA,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative


In [None]:
# One-hot encode the sentiment label. 'Total' is a per-row counter of 1; the
# pivot spreads it into one column per label (NaN elsewhere, replaced by 0).
df['Total'] = 1
df_cat = df.pivot(columns='sentiment_analysis', values='Total').fillna(0)
df = pd.concat([df, df_cat], axis=1)
df.head()

Unnamed: 0,state,cleaned_texts,sentiment,sentiment_analysis,Total,Negative,Neutral,Positive
0,NJ,Don’t drink ladder,0.0,Neutral,1,0.0,1.0,0.0
1,NJ,We expect relapse hopefully contained smaller ...,0.0,Neutral,1,0.0,1.0,0.0
2,TX,Want help great profit local small tshirt busi...,0.3375,Positive,1,0.0,0.0,1.0
3,NY,Me torturing wife quarantine quarantine stay...,-0.181818,Negative,1,1.0,0.0,0.0
4,CA,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative,1,1.0,0.0,0.0


# LDA

In [None]:
import gensim
from gensim.utils import simple_preprocess

In [None]:
def sent_to_words(sentences):
    """Lazily tokenise `sentences`, yielding one list of tokens per item.

    Each item is converted with `str()` and passed to gensim's
    `simple_preprocess`; `deacc=True` asks gensim to strip accent marks
    from the tokens.
    """
    for raw in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw), deacc=True)
        yield tokens

In [None]:
# Tokenise the texts, build the gensim dictionary and bag-of-words corpus,
# then train the LDA topic model.
data_sent = df["cleaned_texts"].values.tolist()
data_words = list(sent_to_words(data_sent))
import gensim.corpora as corpora
# Create Dictionary (token -> integer id)
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency: one bag-of-words per document
corpus = [id2word.doc2bow(text) for text in texts]
# LDA model training
from pprint import pprint
# number of topics
num_topics = 10
# Fixed seed so topic ids stay comparable between runs.
# NOTE(review): with several worker processes the result may still not be
# bit-for-bit identical across machines - confirm if exact reproducibility matters.
lda_random_state = 42
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                    id2word=id2word,
                                    num_topics=num_topics,
                                    random_state=lda_random_state)

In [None]:
def getLDA(text):
  """Return the id of the most probable LDA topic for `text`.

  Reads the module-level `id2word` dictionary and `lda_model`.
  """
  # Tokenise exactly as the training corpus was (see sent_to_words). A plain
  # `text.split()` keeps capital letters and attached punctuation, so those
  # tokens never match the dictionary and are silently ignored.
  tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
  new_text_corpus = id2word.doc2bow(tokens)
  # The model yields (topic_id, probability) pairs for the document.
  liste = lda_model[new_text_corpus]
  return max(liste, key=lambda x:x[1])[0]

# Tag every text with the id of its dominant LDA topic.
dominant_lda_topic = df['cleaned_texts'].apply(getLDA)
df['LDA'] = dominant_lda_topic
df.head()

Unnamed: 0,state,cleaned_texts,sentiment,sentiment_analysis,Total,Negative,Neutral,Positive,LDA
0,NJ,Don’t drink ladder,0.0,Neutral,1,0.0,1.0,0.0,6
1,NJ,We expect relapse hopefully contained smaller ...,0.0,Neutral,1,0.0,1.0,0.0,0
2,TX,Want help great profit local small tshirt busi...,0.3375,Positive,1,0.0,0.0,1.0,0
3,NY,Me torturing wife quarantine quarantine stay...,-0.181818,Negative,1,1.0,0.0,0.0,5
4,CA,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative,1,1.0,0.0,0.0,0


# NMF

In [None]:
# Create the TF-IDF document-term matrix 'V' that NMF factorises below.

from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams only. Float max_df / min_df are document-frequency proportions:
# terms in more than 80% or fewer than 1% of the documents are dropped.
# NOTE(review): despite the "_noun" names, no part-of-speech filtering happens in this cell.
tv_noun = TfidfVectorizer(ngram_range = (1,1), max_df = .8, min_df = .01)

# Fit the vocabulary and transform the texts into a sparse TF-IDF matrix
data_tv_noun = tv_noun.fit_transform(df.cleaned_texts)

# get_feature_names() was removed in scikit-learn 1.2; only fall back to it on old releases
if hasattr(tv_noun, 'get_feature_names_out'):
    tfidf_terms = tv_noun.get_feature_names_out()
else:
    tfidf_terms = tv_noun.get_feature_names()

# Dense frame with one column per term; reuse df's index so rows stay aligned with df
data_dtm_noun = pd.DataFrame(data_tv_noun.toarray(), columns=tfidf_terms, index=df.index)

# Visually inspect Document Term Matrix
data_dtm_noun.head()



Unnamed: 0,all,americans,amp,and,anyone,back,bad,better,big,but,...,well,what,who,work,worker,world,would,wuhan,year,you
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.564144,0.0,0.0,0.0,0.0


In [None]:
# Run NMF on the document-term matrix 'V'
from sklearn.decomposition import NMF

# Number of NMF topics and a fixed seed so the factorisation is repeatable between runs
nmf_n_topics = 10
nmf_random_state = 42
nmf_model = NMF(n_components=nmf_n_topics, random_state=nmf_random_state)

# Learn the model and extract the document-topic matrix 'W'
# (one row per document, one column per topic)
doc_topic = nmf_model.fit_transform(data_dtm_noun)



In [None]:
import numpy as np

def myfunction(x):
  """Return the position of the largest value in the 1-D array `x`.

  When the maximum occurs several times, the first position is returned.
  """
  # np.argmax is a single vectorised call; it replaces a Python-level max()
  # followed by a full-array comparison and gives the same first-max index.
  return np.argmax(x)

In [None]:
# For every document (row of doc_topic), keep the index of its strongest NMF topic.
dominant_nmf_topic = np.apply_along_axis(myfunction, 1, doc_topic)
df['NMF'] = dominant_nmf_topic
df.head()

Unnamed: 0,state,cleaned_texts,sentiment,sentiment_analysis,Total,Negative,Neutral,Positive,LDA,NMF
0,NJ,Don’t drink ladder,0.0,Neutral,1,0.0,1.0,0.0,6,9
1,NJ,We expect relapse hopefully contained smaller ...,0.0,Neutral,1,0.0,1.0,0.0,0,6
2,TX,Want help great profit local small tshirt busi...,0.3375,Positive,1,0.0,0.0,1.0,0,1
3,NY,Me torturing wife quarantine quarantine stay...,-0.181818,Negative,1,1.0,0.0,0.0,5,1
4,CA,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative,1,1.0,0.0,0.0,0,8


# Tableau

In [None]:
# One-hot encode the dominant LDA topic into LDA_<k> indicator columns.
df_lda = df.pivot(columns='LDA', values='Total')
# Guarantee one column per topic id, even for a topic that no document was
# assigned to; otherwise the set of exported columns changes between runs.
df_lda = df_lda.reindex(columns=range(num_topics))
df_lda = df_lda.add_prefix('LDA_')
df_lda = df_lda.fillna(0)
df = pd.concat([df, df_lda], axis=1)
df.head()

Unnamed: 0,state,cleaned_texts,sentiment,sentiment_analysis,Total,Negative,Neutral,Positive,LDA,NMF,LDA_0,LDA_1,LDA_2,LDA_3,LDA_4,LDA_5,LDA_6,LDA_7,LDA_8,LDA_9
0,NJ,Don’t drink ladder,0.0,Neutral,1,0.0,1.0,0.0,6,9,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,NJ,We expect relapse hopefully contained smaller ...,0.0,Neutral,1,0.0,1.0,0.0,0,6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TX,Want help great profit local small tshirt busi...,0.3375,Positive,1,0.0,0.0,1.0,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,NY,Me torturing wife quarantine quarantine stay...,-0.181818,Negative,1,1.0,0.0,0.0,5,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,CA,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative,1,1.0,0.0,0.0,0,8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# One-hot encode the dominant NMF topic into NMF_<k> indicator columns.
df_nmf = df.pivot(columns='NMF', values='Total')
# Guarantee one column per NMF component (columns of doc_topic), even for a
# component that no document was assigned to, so the exported schema is stable.
df_nmf = df_nmf.reindex(columns=range(doc_topic.shape[1]))
df_nmf = df_nmf.add_prefix('NMF_')
df_nmf = df_nmf.fillna(0)
df = pd.concat([df, df_nmf], axis=1)
df.head()

Unnamed: 0,state,cleaned_texts,sentiment,sentiment_analysis,Total,Negative,Neutral,Positive,LDA,NMF,...,NMF_0,NMF_1,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9
0,NJ,Don’t drink ladder,0.0,Neutral,1,0.0,1.0,0.0,6,9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,NJ,We expect relapse hopefully contained smaller ...,0.0,Neutral,1,0.0,1.0,0.0,0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,TX,Want help great profit local small tshirt busi...,0.3375,Positive,1,0.0,0.0,1.0,0,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,NY,Me torturing wife quarantine quarantine stay...,-0.181818,Negative,1,1.0,0.0,0.0,5,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CA,Angela Merkel declares coronavirus biggest cha...,-0.2,Negative,1,1.0,0.0,0.0,0,8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Aggregate per state: counts are summed, the polarity score is averaged.
state_groups = df.groupby(['state'], as_index=False)
# numeric_only=True keeps the text columns out of the sum; without it recent
# pandas concatenates every string of a group instead of dropping the column.
df_state = state_groups.sum(numeric_only=True)
# A summed polarity is not meaningful, so replace it with the per-state mean.
# Both results come from the same grouping, so their rows are in the same order.
df_temp = state_groups.agg({'sentiment': 'mean'})
df_state['sentiment'] = df_temp['sentiment']
df_state.head()

Unnamed: 0,state,sentiment,Total,Negative,Neutral,Positive,LDA,NMF,LDA_0,LDA_1,...,NMF_0,NMF_1,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9
0,AK,0.0,1,0.0,1.0,0.0,6,8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,AL,-0.075893,7,3.0,2.0,2.0,21,39,1.0,2.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0
2,AR,0.016667,2,0.0,1.0,1.0,9,9,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,AZ,0.131933,15,2.0,5.0,8.0,71,49,2.0,0.0,...,6.0,1.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0
4,CA,0.009743,138,35.0,54.0,49.0,584,625,21.0,19.0,...,29.0,8.0,10.0,10.0,7.0,11.0,10.0,18.0,17.0,18.0


In [None]:
# Add a nation-wide 'USA' row below the per-state rows.
usa_totals = df.sum(numeric_only=True)
# The state rows hold the MEAN polarity, so the national row must too
# (a plain column sum would put the total of all scores here).
usa_totals['sentiment'] = df['sentiment'].mean()
usa_row = usa_totals.to_frame().T.assign(state='USA')
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df_state = pd.concat([df_state, usa_row], ignore_index=True)
df_state['state'] = df_state['state'].fillna('USA')
df_state.head()

Unnamed: 0,state,sentiment,Total,Negative,Neutral,Positive,LDA,NMF,LDA_0,LDA_1,...,NMF_0,NMF_1,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9
0,AK,0.0,1.0,0.0,1.0,0.0,6.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,AL,-0.075893,7.0,3.0,2.0,2.0,21.0,39.0,1.0,2.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0
2,AR,0.016667,2.0,0.0,1.0,1.0,9.0,9.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,AZ,0.131933,15.0,2.0,5.0,8.0,71.0,49.0,2.0,0.0,...,6.0,1.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0
4,CA,0.009743,138.0,35.0,54.0,49.0,584.0,625.0,21.0,19.0,...,29.0,8.0,10.0,10.0,7.0,11.0,10.0,18.0,17.0,18.0


In [None]:
from sklearn.feature_extraction import text 

# scikit-learn's English stop words plus the corpus-specific terms listed here.
extra_stop_words = ["covid","coronavirus","corona","virus"]
my_stop_words = text.ENGLISH_STOP_WORDS.union(extra_stop_words)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Number of most frequent words kept per state
top_n_words = 10

def _top_words(word_counts, n=top_n_words):
    """Return the `n` largest entries of `word_counts` as a ('words', 'freq') pair of arrays."""
    top = word_counts.nlargest(n)
    return pd.Series([top.index.values, top.values], index=['words', 'freq'])

# Pass the stop words as a list: recent scikit-learn validates the parameter
# type and rejects a frozenset.
vec = CountVectorizer(stop_words = list(my_stop_words), max_features=1000)
X = vec.fit_transform(df['cleaned_texts'].to_list())

# get_feature_names() was removed in scikit-learn 1.2; only fall back to it on old releases
if hasattr(vec, 'get_feature_names_out'):
    vocabulary_terms = vec.get_feature_names_out()
else:
    vocabulary_terms = vec.get_feature_names()

# Reuse df's index. df lost rows in the earlier filters, so its labels have
# gaps; a fresh 0..n-1 index would pair counts with the wrong state (or none).
temp = pd.DataFrame(X.toarray(), columns=vocabulary_terms, index=df.index)

# Group by the state Series rather than adding a 'state' column to temp: such
# a column would overwrite the counts of a vocabulary word spelled "state".
df_words = temp.groupby(df['state']).sum().apply(_top_words, axis=1)

# Nation-wide row: top words over all documents
temp2 = _top_words(temp.sum()).to_frame('USA').T.rename_axis('state')

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
df_words = pd.concat([df_words, temp2])
df_words = df_words.reset_index(level=0)
df_words.head()

  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,state,words,freq
0,AK,"[age, country, dead, joe, able, absolute, abso...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
1,AL,"[like, texas, accurate, af, business, closing,...","[2, 2, 1, 1, 1, 1, 1, 1, 1, 1]"
2,AR,"[fucking, medium, source, thread, able, absolu...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
3,AZ,"[di, die, feel, home, ill, let, nurse, soon, s...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]"
4,CA,"[home, amp, day, don, people, new, china, im, ...","[8, 7, 7, 7, 7, 6, 5, 5, 5, 5]"


In [None]:
# Attach each state's top words by matching on the state code. The two frames
# are built independently, so copying by row position would silently shift
# every value if one of them lacks a state or is ordered differently.
words_by_state = df_words.set_index('state')
df_state['words'] = df_state['state'].map(words_by_state['words'])
df_state['freq'] = df_state['state'].map(words_by_state['freq'])
df_state.head()

Unnamed: 0,state,sentiment,Total,Negative,Neutral,Positive,LDA,NMF,LDA_0,LDA_1,...,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9,words,freq
0,AK,0.0,1.0,0.0,1.0,0.0,6.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,"[age, country, dead, joe, able, absolute, abso...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
1,AL,-0.075893,7.0,3.0,2.0,2.0,21.0,39.0,1.0,2.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,"[like, texas, accurate, af, business, closing,...","[2, 2, 1, 1, 1, 1, 1, 1, 1, 1]"
2,AR,0.016667,2.0,0.0,1.0,1.0,9.0,9.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,"[fucking, medium, source, thread, able, absolu...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
3,AZ,0.131933,15.0,2.0,5.0,8.0,71.0,49.0,2.0,0.0,...,2.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,"[di, die, feel, home, ill, let, nurse, soon, s...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]"
4,CA,0.009743,138.0,35.0,54.0,49.0,584.0,625.0,21.0,19.0,...,10.0,10.0,7.0,11.0,10.0,18.0,17.0,18.0,"[home, amp, day, don, people, new, china, im, ...","[8, 7, 7, 7, 7, 6, 5, 5, 5, 5]"


In [None]:
# Two-letter postal code -> full name, for the 50 US states plus the
# District of Columbia (51 entries). Codes absent from this table are
# expected to be handled by the caller.
states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}

In [None]:
# Put a readable state name right after the code column. Codes that are not
# in `states` (such as the 'USA' total row) are kept unchanged.
state_names = df_state['state'].map(lambda code: states.get(code, code))
df_state.insert(1, 'state_name', state_names)

In [None]:
def _dominant_topic_by_state(frame, topic_col):
    """Return a one-column ('Total') frame: the most frequent `topic_col` value per state."""
    pair_counts = frame.groupby(['state', topic_col])['Total'].size()
    # With the topic id left as the index, idxmax returns the topic id itself.
    return pair_counts.reset_index(level='state').groupby('state')['Total'].idxmax().to_frame()

def _dominant_topic_overall(frame, topic_col):
    """Return the most frequent `topic_col` value over every row of `frame`."""
    return frame.groupby(topic_col)['Total'].size().idxmax()

def _topic_labels_for(state_codes, per_state, overall):
    """Map state codes to their dominant topic (as text), using `overall` for 'USA'."""
    lookup = per_state['Total'].copy()
    lookup['USA'] = overall
    # Match on the state code instead of relying on both frames sharing a row order.
    return state_codes.map(lookup).astype(str)

lda = _dominant_topic_by_state(df, 'LDA')
# The national row gets the topic that is actually most frequent overall
# (it used to be hard-coded to topic 0 whatever the data said).
lda_overall = _dominant_topic_overall(df, 'LDA')
liste_lda = np.append(lda['Total'].values, lda_overall)
df_state['LDA'] = _topic_labels_for(df_state['state'], lda, lda_overall)

nmf = _dominant_topic_by_state(df, 'NMF')
nmf_overall = _dominant_topic_overall(df, 'NMF')
liste_nmf = np.append(nmf['Total'].values, nmf_overall)
df_state['NMF'] = _topic_labels_for(df_state['state'], nmf, nmf_overall)

In [None]:
# Display the final per-state summary (one row per state, plus the 'USA' total row).
df_state

Unnamed: 0,state,state_name,sentiment,Total,Negative,Neutral,Positive,LDA,NMF,LDA_0,...,NMF_2,NMF_3,NMF_4,NMF_5,NMF_6,NMF_7,NMF_8,NMF_9,words,freq
0,AK,Alaska,0.0,1.0,0.0,1.0,0.0,6,8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,"[age, country, dead, joe, able, absolute, abso...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
1,AL,Alabama,-0.075893,7.0,3.0,2.0,2.0,1,9,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,"[like, texas, accurate, af, business, closing,...","[2, 2, 1, 1, 1, 1, 1, 1, 1, 1]"
2,AR,Arkansas,0.016667,2.0,0.0,1.0,1.0,4,2,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,"[fucking, medium, source, thread, able, absolu...","[1, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
3,AZ,Arizona,0.131933,15.0,2.0,5.0,8.0,5,0,2.0,...,2.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,"[di, die, feel, home, ill, let, nurse, soon, s...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]"
4,CA,California,0.009743,138.0,35.0,54.0,49.0,0,0,21.0,...,10.0,10.0,7.0,11.0,10.0,18.0,17.0,18.0,"[home, amp, day, don, people, new, china, im, ...","[8, 7, 7, 7, 7, 6, 5, 5, 5, 5]"
5,CO,Colorado,0.048611,6.0,2.0,2.0,2.0,1,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,"[america, amp, berniesanders, biggest, coronap...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
6,CT,Connecticut,0.122222,9.0,0.0,7.0,2.0,1,2,1.0,...,4.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,"[europe, americans, away, day, doesn, enjoy, e...","[2, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
7,DC,District of Columbia,-0.021554,18.0,5.0,9.0,4.0,0,7,3.0,...,1.0,2.0,1.0,0.0,1.0,5.0,0.0,1.0,"[really, trump, action, amp, dem, drug, goveno...","[3, 3, 2, 2, 2, 2, 2, 2, 2, 2]"
8,DE,Delaware,0.0,3.0,0.0,3.0,0.0,0,0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,"[amp, comfort, finding, fund, hey, music, oh, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
9,FL,Florida,0.009464,54.0,18.0,17.0,19.0,0,0,10.0,...,5.0,1.0,3.0,1.0,5.0,5.0,4.0,6.0,"[amp, life, open, people, best, need, school, ...","[4, 4, 4, 4, 3, 3, 3, 3, 2, 2]"


In [None]:
# Write the per-state summary to CSV; index=False leaves the row index out of the file.
output_path = '/content/drive/MyDrive/Colab Notebooks/M2_MLSD/PPD/data_states.csv'
df_state.to_csv(output_path, index=False)