In [None]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import spacy
from matplotlib.pyplot import imread

# Text Mining
from nltk.corpus import stopwords
from textblob import TextBlob
from wordcloud import WordCloud,STOPWORDS

# Warnings: silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation notices raised by
# the libraries used below; consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Read the tweet export into a DataFrame; 'latin' selects the Latin-1 codec.
# The bare `tweets` on the last line renders the frame as the cell output.
tweets = pd.read_csv("Elon_musk.csv",encoding = 'latin')
tweets

### 1. EDA

#### 1.1 Number of Words

In [None]:
# Words per tweet. Splitting on any whitespace run (rather than on a single
# " ") keeps consecutive spaces, tabs and newlines from inflating the count
# and matches the whitespace tokenisation used by the other per-tweet
# features. astype(str) keeps the original str(x) coercion of non-string cells.
tweets['word_count'] = tweets['Text'].astype(str).str.split().str.len()
tweets[['Text','word_count']].head()

#### 1.2 Number of characters

In [None]:
# Number of characters in each tweet.
# .str.len() counts every character of the string, so spaces are included.
tweets['char_count'] = tweets['Text'].str.len() ## this also includes spaces
tweets[['Text','char_count']].head()

#### 1.3 Average Word Length 

In [None]:
def avg_word(sentence):
    """Return the mean length, in characters, of the whitespace-separated words.

    Parameters
    ----------
    sentence : str
        Text to measure; it is split on runs of whitespace.

    Returns
    -------
    float
        Average word length, or 0.0 when ``sentence`` contains no words
        (empty or whitespace-only text), instead of raising
        ZeroDivisionError.
    """
    words = sentence.split()
    if not words:
        # Nothing to average over; avoid dividing by zero.
        return 0.0
    return sum(len(word) for word in words) / len(words)

# Mean word length of every tweet, stored alongside the other text statistics.
tweets['avg_word'] = tweets['Text'].map(avg_word)
tweets[['Text', 'avg_word']].head()

#### 1.4 Number of stopwords

In [None]:
# NLTK's English stop-word list. The corpus is fetched on demand because the
# notebook's download cell only runs further down, so a fresh environment
# would otherwise fail here with LookupError.
import nltk
try:
    stop = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop = stopwords.words('english')

# A set gives constant-time membership tests; with the list form every token
# lookup scanned the whole stop-word list.
stop_lookup = set(stop)

# Number of whitespace-separated tokens per tweet that are stop words.
# The comparison is case-sensitive, as before.
tweets['stopwords'] = tweets['Text'].apply(
    lambda text: sum(1 for token in text.split() if token in stop_lookup)
)
tweets[['Text','stopwords']].head()

#### 1.5 Number of Special Characters (words starting with `@`)

In [None]:
# Count the whitespace-separated tokens in each tweet that begin with '@'.
# NOTE(review): the column is called 'hastags' but the prefix tested is '@'
# (user mentions), not '#'; confirm which one was intended.
tweets['hastags'] = tweets['Text'].apply(
    lambda text: sum(1 for token in text.split() if token.startswith('@'))
)
tweets[['Text', 'hastags']].head()

#### 1.6 Number of Numerics

In [None]:
# Count the tokens in each tweet made up solely of digits (str.isdigit),
# so values such as "3.5" or "10k" are not counted.
tweets['numerics'] = tweets['Text'].apply(
    lambda text: sum(1 for token in text.split() if token.isdigit())
)
tweets[['Text', 'numerics']].head()

#### 1.7 Number of Upper Case Words

In [None]:
# Count the tokens in each tweet for which str.isupper() is true, i.e. tokens
# whose cased characters are all upper case.
tweets['upper'] = tweets['Text'].apply(
    lambda text: sum(1 for token in text.split() if token.isupper())
)
tweets[['Text', 'upper']].head()

### 2. Text Preprocessing

In [None]:
# Strip leading/trailing whitespace from every tweet and keep only the
# non-empty results (an empty string is falsy).
# NOTE(review): this rebinds `tweets` from the DataFrame to a plain list of
# strings, so the cell cannot be re-run without reloading the data first.
tweets = [text for text in (raw.strip() for raw in tweets.Text) if text]
tweets[:10]

In [None]:
# Concatenate all tweets into one space-separated string and preview the
# first 50 characters.
tweets_text = ' '.join(tweets)
tweets_text[:50]

In [None]:
# Tokenise the combined text with NLTK's TweetTokenizer; strip_handles=True
# asks it to drop Twitter @username handles from the token stream.
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True)
tweets_tokens=tknzr.tokenize(tweets_text)
# Preview the first 100 tokens.
print(tweets_tokens[0:100])

In [None]:
# Re-assemble the handle-free tokens into a single space-separated string and
# preview the first 500 characters.
tweets_tokens_text = ' '.join(tweets_tokens)
tweets_tokens_text[:500]

In [None]:
# Delete every character listed in string.punctuation (ASCII punctuation
# only) by translating each of them to None.
no_punc_text = tweets_tokens_text.translate(
    str.maketrans(dict.fromkeys(string.punctuation))
)
no_punc_text[:500]

In [None]:
# Drop every run of non-whitespace characters that starts with "http".
# Punctuation has already been stripped at this point, so a URL appears here
# as one unbroken token beginning with "http".
import re
no_url_text = re.compile(r'http\S+').sub('', no_punc_text)
no_url_text[:500]

In [None]:
# Split the cleaned text into word tokens with NLTK's word_tokenize.
# The tokenizer needs NLTK's Punkt data, but the notebook's download cell
# only runs after this one; fetch the data on demand so a fresh environment
# does not stop here with LookupError.
import nltk
from nltk.tokenize import word_tokenize
try:
    text_tokens=word_tokenize(no_url_text)
except LookupError:
    nltk.download('punkt')
    # NOTE(review): recent NLTK releases look for the data under the name
    # 'punkt_tab'; requesting both names is intended to cover old and new
    # releases. Confirm against the installed NLTK version.
    nltk.download('punkt_tab')
    text_tokens=word_tokenize(no_url_text)
# Preview the first 150 tokens.
print(text_tokens[0:150])

In [None]:
# Download the NLTK resources used by this notebook: the 'punkt' tokenizer
# data and the 'stopwords' corpus.
# NOTE(review): earlier cells already call stopwords.words() and
# word_tokenize(); on a fresh environment this cell would need to run first.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Total number of word tokens produced by word_tokenize (before stop-word removal).
len(text_tokens)

In [None]:
# Remove stop words: NLTK's English list plus a few corpus-specific fillers.
from nltk.corpus import stopwords
my_stop_words=stopwords.words('english')

# Extra tokens observed in this data set that carry no meaning
# (a stray '\x92' byte, retweet marker, interjections, an emoji code).
sw_list = ['\x92','rt','ye','yeah','haha','Yes','U0001F923','I']
my_stop_words.extend(sw_list)

# The tokens are only lower-cased in a later step, so compare
# case-insensitively here; otherwise capitalised stop words ("The", "And",
# "Yeah") slip through and reappear as stop words once lower-cased.
# A set also makes each membership test constant-time.
stop_lookup = {word.lower() for word in my_stop_words}

# Surviving tokens keep their original casing.
no_stop_tokens=[word for word in text_tokens if word.lower() not in stop_lookup]
print(no_stop_tokens[0:150])

In [None]:
# Normalise casing: lower-case every remaining token, then preview
# tokens 100-199.
lower_words = list(map(str.lower, no_stop_tokens))
print(lower_words[100:200])

In [None]:
# Stemming (optional): reduce each lower-cased token to its Porter stem.
# The stems are only previewed here; the lemmatisation below works from
# lower_words, not from stemmed_tokens.
from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed_tokens = list(map(ps.stem, lower_words))
print(stemmed_tokens[100:200])

In [None]:
# Lemmatization: load spaCy's small English pipeline and run it over the
# lower-cased tokens re-joined into one space-separated string.
nlp=spacy.load('en_core_web_sm')
doc=nlp(' '.join(lower_words))
# Preview the first 150 tokens of the parsed document.
print(doc[0:150])

In [None]:
# Collect the lemma (base form) spaCy assigned to each token of the parsed
# document, then preview the first 150.
lemmas = [tok.lemma_ for tok in doc]
print(lemmas[:150])

In [None]:
# Cleaned corpus as a single space-separated string of lemmas; preview the
# first 500 characters.
clean_tweets = ' '.join(lemmas)
clean_tweets[:500]

### 3. Feature Extraction

#### 3.1 Using CountVectorizer

In [None]:
# Bag-of-words counts with scikit-learn's CountVectorizer (default settings).
# `lemmas` is a list with one lemma string per element, so each individual
# lemma is treated as its own document: the result has one row per lemma.
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
tweetscv=cv.fit_transform(lemmas)

In [None]:
# Mapping of each learned term to its column index in the count matrix.
print(cv.vocabulary_)

In [None]:
# Terms 100-199 of the fitted vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
# .tolist() converts the returned array to plain Python strings for printing.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
print(feature_names[100:200])

In [None]:
# Rows 100-199 of the count matrix. Slice the sparse matrix first and only
# densify those rows, instead of materialising the whole matrix in memory.
print(tweetscv[100:200].toarray())

In [None]:
# (documents, vocabulary size) of the count matrix. The sparse matrix already
# exposes .shape, so there is no need to build the dense array first.
print(tweetscv.shape)

#### 3.2 CountVectorizer with N-grams (Bigrams & Trigrams)

In [None]:
# Count the 100 most frequent unigrams, bigrams and trigrams.
# `lemmas` holds one lemma per element; fitting on it directly makes every
# document a single token, so no bigram or trigram could ever be formed.
# Fit on the lemmas joined into one text instead so multi-word n-grams exist.
# NOTE(review): per-tweet boundaries were lost when the tweets were joined
# earlier, so n-grams can span adjacent tweets and the matrix has one row.
cv_ngram_range=CountVectorizer(analyzer='word',ngram_range=(1,3),max_features=100)
bow_matrix_ngram=cv_ngram_range.fit_transform([' '.join(lemmas)])

In [None]:
# Show the selected n-gram features and their count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
if hasattr(cv_ngram_range, 'get_feature_names_out'):
    ngram_feature_names = cv_ngram_range.get_feature_names_out().tolist()
else:
    ngram_feature_names = cv_ngram_range.get_feature_names()
print(ngram_feature_names)
print(bow_matrix_ngram.toarray())

#### 3.3 TF-IDF Vectorizer

In [None]:
# TF-IDF weights for the 500 most frequent unigrams, bigrams and trigrams,
# L2-normalised per document.
# `lemmas` holds one lemma per element; fitting on it directly makes every
# document a single token, so no bigram or trigram could ever be formed.
# Fit on the lemmas joined into one text instead.
# NOTE(review): with a single document the IDF term is constant, so the
# weights reduce to normalised term frequencies; a per-tweet corpus would be
# needed for meaningful IDF values.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfv_ngram_max_features=TfidfVectorizer(norm='l2',analyzer='word',ngram_range=(1,3),max_features=500)
tfidf_matix_ngram=tfidfv_ngram_max_features.fit_transform([' '.join(lemmas)])

In [None]:
# Show the selected TF-IDF features and their weight matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on old releases.
if hasattr(tfidfv_ngram_max_features, 'get_feature_names_out'):
    tfidf_feature_names = tfidfv_ngram_max_features.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidfv_ngram_max_features.get_feature_names()
print(tfidf_feature_names)
print(tfidf_matix_ngram.toarray())

### 4. Generate Word Cloud

In [None]:
# Define a function to plot word cloud
def plot_cloud(wordcloud):
    """Draw ``wordcloud`` with ``plt.imshow`` on a new figure and hide the axes.

    A new pyplot figure with ``figsize=(40, 30)`` is created on every call.
    Nothing is returned; the figure is left as pyplot's current figure.
    """
    plt.figure(figsize=(40,30))
    plt.imshow(wordcloud)
    plt.axis('off')
    
# Build the word cloud from the cleaned text and display it.

# Extra filler tokens to suppress on top of wordcloud's built-in list.
# Note: this adds to the module-level STOPWORDS set itself.
STOPWORDS.update(('pron', 'rt', 'yeah'))

# Keep only the 50 most prominent words on a black 3000x2000 canvas.
wordcloud = WordCloud(
    width=3000,
    height=2000,
    background_color='black',
    max_words=50,
    colormap='Set1',
    stopwords=STOPWORDS,
).generate(clean_tweets)
plot_cloud(wordcloud)

### 5. Named Entity Recognition (NER)

In [None]:
# Run spaCy over the cleaned text and render the recognised named entities
# inline (displacy style='ent'). The same parsed document also carries the
# part-of-speech tags used in the following cells.
nlp=spacy.load('en_core_web_sm')

one_block=clean_tweets
doc_block=nlp(one_block)
spacy.displacy.render(doc_block,style='ent',jupyter=True)

In [None]:
# Print tokens 100-199 next to their coarse part-of-speech tag.
for token in doc_block[100:200]:
    print(token,token.pos_)

In [None]:
# Keep only the text of tokens tagged as NOUN or VERB, then preview
# entries 100-199.
nouns_verbs = [tok.text for tok in doc_block if tok.pos_ in {'NOUN', 'VERB'}]
print(nouns_verbs[100:200])

In [None]:
# Frequency table of the noun and verb tokens.
# Note: `cv` is rebound here, replacing the vectorizer fitted in the
# feature-extraction section.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

# One row per token, one column per vocabulary term; summing over the rows
# gives the total count of each term.
X = cv.fit_transform(nouns_verbs)
sum_words = X.sum(axis=0)

# Pair every term with its total and order the pairs from most to least
# frequent.
words_freq = sorted(
    ((term, sum_words[0, col]) for term, col in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

wd_df = pd.DataFrame(words_freq)
wd_df.columns = ['word', 'count']
wd_df[:10]  # top ten results

In [None]:
# Bar chart of the ten most frequent nouns and verbs.
# The trailing ';' suppresses the textual repr of the returned Axes.
wd_df[0:10].plot.bar(x='word',figsize=(12,8),title='Top 10 nouns and verbs');

### 6. Emotion Mining - Sentiment Analysis

In [None]:
# Split the tweets into sentences with NLTK's sentence tokenizer.
# At this point `tweets` is the list of stripped tweet strings built in the
# text-processing section, not the original DataFrame.
from nltk import tokenize
sentences=tokenize.sent_tokenize(' '.join(tweets))
# Preview the first 150 sentences.
print(sentences[0:150])

In [None]:
# One row per sentence, in a single 'sentence' column.
sent_df=pd.DataFrame(sentences,columns=['sentence'])
sent_df

In [None]:
# Emotion lexicon (AFINN), read from a local CSV file.
# The next cell uses its 'word' and 'value' columns.
affin=pd.read_csv('Afinn.csv',sep=',',encoding='Latin-1')
affin

In [None]:
# Turn the lexicon into a plain dict mapping each word to its value, for
# fast lookups while scoring.
affinity_scores=affin.set_index('word')['value'].to_dict()
affinity_scores

In [None]:
# Set-up for the scoring function below: each word of a sentence is looked up
# in the lexicon by its lemma, and the scores are summed over the whole
# original sentence.
nlp=spacy.load('en_core_web_sm')
sentiment_lexicon=affinity_scores

def calculate_sentiment(text:str=None):
    """Return the summed lexicon score of ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``; each
    token's lemma is looked up in ``sentiment_lexicon`` and tokens that are
    not in the lexicon contribute 0.

    Parameters
    ----------
    text : str, optional
        Sentence to score. ``None`` or an empty string scores 0.

    Returns
    -------
    The sum of the lexicon values of all tokens (0 when nothing matches).
    """
    if not text:
        return 0
    return sum(sentiment_lexicon.get(token.lemma_, 0) for token in nlp(text))

In [None]:
# Manual sanity check: score a single word through the scoring function.
calculate_sentiment(text='great')

In [None]:
# Score every sentence with calculate_sentiment and store the result in a
# new 'sentiment_value' column (each sentence is parsed by spaCy).
sent_df['sentiment_value']=sent_df['sentence'].apply(calculate_sentiment)
sent_df['sentiment_value']

In [None]:
# Number of whitespace-separated words in each sentence.
sentence_tokens = sent_df['sentence'].str.split()
sent_df['word_count'] = sentence_tokens.map(len)
sent_df['word_count']

In [None]:
# Sentences ordered from the lowest to the highest sentiment value.
sent_df.sort_values(by='sentiment_value')

In [None]:
# Summary statistics of the per-sentence sentiment values across all tweets.
sent_df['sentiment_value'].describe()

In [None]:
# Sentences whose sentiment value is zero or below, i.e. neutral and negative.
# NOTE(review): use `< 0` instead if only strictly negative sentences are wanted.
sent_df[sent_df['sentiment_value']<=0]

In [None]:
# Add an explicit positional index column (0 .. n-1) so the sentences can be
# plotted in their original order. No figure is created in this cell: a bare
# plt.figure() here would only render an empty canvas.
sent_df['index']=range(len(sent_df))

In [None]:
# Distribution of the per-sentence sentiment values (histogram + KDE curve).
plt.figure(figsize=(15,10))
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# documented replacement for its default histogram-plus-density output.
# Fall back to distplot only on seaborn releases that predate histplot.
if hasattr(sns, 'histplot'):
    sns.histplot(sent_df['sentiment_value'], kde=True, stat='density')
else:
    sns.distplot(sent_df['sentiment_value'])

In [None]:
# Line plot of the sentiment value of each sentence, in sentence order.
plt.figure(figsize=(15, 10))
sns.lineplot(data=sent_df, x='index', y='sentiment_value')

In [None]:
# Correlation analysis: scatter plot of sentence length (word count) against
# the sentence's sentiment value.
sent_df.plot.scatter(x='word_count',y='sentiment_value',figsize=(8,8),title='Sentence sentiment value to sentence word count')