# Assignment

Perform sentiment analysis on Elon Musk's tweets.

In [1]:
import nltk

# Download only the NLTK resources this notebook actually uses instead of
# nltk.download('all'), which fetches every corpus and model and was
# interrupted the last time this cell ran.
#   punkt / punkt_tab -> word_tokenize and sent_tokenize
#   stopwords         -> stopwords.words('english')
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

KeyboardInterrupt: 

In [None]:
# Install spaCy's small English pipeline; later cells load it with spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import imread
import spacy
import string

from wordcloud import WordCloud

In [None]:
# Load the raw tweet export, decoding the file as Latin-1, and display it.
tweets = pd.read_csv("Elon_musk.csv",encoding='Latin-1')
tweets

In [None]:
# Drop the 'Unnamed: 0' column (presumably a saved row index - confirm against the CSV).
tweets_data = tweets.drop(['Unnamed: 0'],axis=1)
tweets_data

In [None]:
#Text Preprocessing
# Trim leading/trailing whitespace from every tweet and keep only the results
# that are still non-empty (an empty string is falsy in Python).
# Note: tweets_data is rebound here from a DataFrame to a plain list of strings.
tweets_data = [cleaned for cleaned in (raw.strip() for raw in tweets_data.Text) if cleaned]
tweets_data[0:10]

In [None]:
#Joining all the tweets into a single space-separated string
Tweets_text = ' '.join(tweets_data)
Tweets_text

In [None]:
#Using TweetTokenizer with strip_handles=True to remove usernames (@handles) while tokenizing
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer(strip_handles=True)
tweets_tokens=tknzr.tokenize(Tweets_text)
# Preview the first 50 tokens.
print(tweets_tokens[:50])

In [None]:
#Joining all the tweets_tokens back into one space-separated text (handles are now gone)
tweets_tokens_text = ' '.join(tweets_tokens)
tweets_tokens_text

In [None]:
# Remove punctuation: the translation table deletes every character in string.punctuation.
no_punch_tweet_text = tweets_tokens_text.translate(str.maketrans('','',string.punctuation))
no_punch_tweet_text

In [None]:
#Removing urls
# `re` is not imported by the import cells of this notebook, so import it here;
# without it this cell fails with NameError.
import re

# The punctuation pass above has already stripped ':', '/' and '.' from links,
# so each URL is now a single run of characters starting with "http"; drop
# that whole run.
no_url_tweet_text = re.sub(r'http\S+', '', no_punch_tweet_text)
no_url_tweet_text[:1000]

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

In [None]:
# Split the punctuation- and URL-free text into word tokens and preview the first 100.
text_tokens = word_tokenize(no_url_tweet_text)
print(text_tokens[0:100])

In [None]:
len(text_tokens)

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
my_stop_words = stopwords.words('english')

# Extra tokens to discard on top of NLTK's English stop-word list.
add_stopwords = ['the','\x92','U0001F923','rt','haha','ye']
my_stop_words.extend(add_stopwords)

# Match case-insensitively: the tokens are only lower-cased in the next cell,
# so an exact comparison would let capitalised stop words such as "The", "I"
# or "RT" slip through. A set also makes each membership test O(1) instead of
# scanning the whole list for every token.
stop_word_lookup = {word.lower() for word in my_stop_words}
no_stopwords_tokens = [word for word in text_tokens if word.lower() not in stop_word_lookup]
print(no_stopwords_tokens[0:100])

In [None]:
# Normalise the remaining tokens to lower case and preview the first 100.
lower_words = [x.lower() for x in no_stopwords_tokens]
print(lower_words[:100])

In [None]:
#Stemming
# Reduce each lower-cased token to its Porter stem.
# NOTE(review): stemmed_tokens is not referenced by the later cells visible here,
# which work from the spaCy lemmas instead.
from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed_tokens = [ps.stem(word) for word in lower_words]
print(stemmed_tokens[:100])

In [None]:
#Lemmatization using the spaCy library
# Load the small English pipeline and run it over the lower-cased tokens joined into one text.
nlp = spacy.load('en_core_web_sm')
doc = nlp(' '.join(lower_words))
# Preview the first 100 tokens of the parsed document.
print(doc[:100])

In [None]:
# Collect the lemma of every token in the spaCy document and preview the first 100.
lemmas = [token.lemma_ for token in doc]
print(lemmas[:100])

In [None]:
#Feature Extraction

In [None]:
#Using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
# Each element of `lemmas` is a single lemma string, so every lemma is passed to the
# vectorizer as its own document.
tweet_cv = cv.fit_transform(lemmas)

In [None]:
print(cv.vocabulary_)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the same vocabulary terms.
print(cv.get_feature_names_out()[50:100])
print(tweet_cv.toarray()[50:100])

In [None]:
tweet_cv.toarray().shape

In [None]:
#Using unigrams, bigrams and trigrams (ngram_range=(1,3)), capped at 100 features
cv_ngram_range = CountVectorizer(analyzer='word',ngram_range=(1,3),max_features = 100)
# Fitted on tweets_data, which at this point is the list of stripped tweet strings.
bow_matrix_ngram =cv_ngram_range.fit_transform(tweets_data)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the same n-gram terms.
print(cv_ngram_range.get_feature_names_out()[50:100])
print(bow_matrix_ngram.toarray()[50:100])

In [None]:
#TF-IDF vectorizer
# L2-normalised TF-IDF over word 1- to 3-grams, capped at 500 features and fitted on
# the list of tweet strings in tweets_data.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfv_ngram_max_features = TfidfVectorizer(norm='l2',analyzer='word',ngram_range=(1,3),max_features=500)
tfidfv_matrix_ngram = tfidfv_ngram_max_features.fit_transform(tweets_data)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns the same terms.
print(tfidfv_ngram_max_features.get_feature_names_out())
print(tfidfv_matrix_ngram.toarray())

In [None]:
# Join the lemmas into one space-separated string; the word cloud and NER cells below use it.
clean_tweets=' '.join(lemmas)
clean_tweets

In [None]:
#Generate Word Cloud

from wordcloud import STOPWORDS

def plot_cloud(wordcloud):
    """Draw `wordcloud` as an image on a new 40x30 figure with the axes hidden."""
    plt.figure(figsize=(40, 30))
    plt.imshow(wordcloud)
    # Hide ticks, labels and the frame so that only the image is shown.
    plt.gca().set_axis_off()
    
# Build a private stop-word set instead of calling STOPWORDS.add('rt'):
# STOPWORDS is wordcloud's shared module-level set, so mutating it would
# change the stop words seen by every other user of it in this session.
cloud_stopwords = STOPWORDS | {'rt'}
wordcloud = WordCloud(width = 3000, height = 2000, background_color='black', max_words=100,colormap='Set2',stopwords=cloud_stopwords).generate(clean_tweets)
plot_cloud(wordcloud)

In [None]:
#Named Entity Recognition
# Re-load the spaCy pipeline and parse the cleaned, lemmatised text as one block.
nlp = spacy.load('en_core_web_sm')
one_bloclk = clean_tweets
doc_block = nlp(one_bloclk)
# Render the recognised entities inline in the notebook.
spacy.displacy.render(doc_block,style='ent',jupyter=True)

In [None]:
# Print each of the first 100 tokens next to its coarse part-of-speech tag.
for token in doc_block[0:100]:
    print(token, token.pos_)

In [None]:
# Keep only the tokens tagged as nouns or verbs and preview the first 50.
nouns_verbs = [token.text for token in doc_block if token.pos_ in ('NOUN', 'VERB')]
print(nouns_verbs[0:50])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

# Vectorise the noun/verb tokens (one token per document) and total each column.
X = cv.fit_transform(nouns_verbs)
sum_words = X.sum(axis=0)

# Pair every vocabulary term with its column total, then order most frequent first.
words_freq = [(term, sum_words[0, col]) for term, col in cv.vocabulary_.items()]
words_freq.sort(key=lambda pair: pair[1], reverse=True)

# Tabulate the (word, count) pairs.
wf_df = pd.DataFrame(words_freq)
wf_df.columns = ['word', 'count']

wf_df[0:20]

In [None]:
wf_df[0:10].plot.bar(x='word', figsize=(12,8), title='Top verbs and nouns')

In [None]:
# Fresh copy of the raw tweets without the 'Unnamed: 0' column, used for
# per-tweet scoring further down. The original `tweets.drop[]` was a syntax
# error; this mirrors the drop call used earlier to build tweets_data.
tweets_data1 = tweets.drop(['Unnamed: 0'], axis=1)
tweets_data1

In [None]:
# Join the tweet strings into one text and split it into sentences; show the first 15.
from nltk import tokenize
sentences = tokenize.sent_tokenize(" ".join(tweets_data))
sentences[0:15]

In [None]:
# One row per sentence, in a single 'sentence' column.
sent_df = pd.DataFrame(sentences, columns=['sentence'])
sent_df

In [None]:
# Number of whitespace-separated words in each sentence.
sent_df['word_count'] = sent_df['sentence'].str.split().apply(len)
sent_df['word_count'].head(10)

In [None]:
# Load the AFINN lexicon from a comma-separated, Latin-1 encoded file.
affin = pd.read_csv('Afinn.csv',sep=',',encoding='Latin-1')
affin

In [None]:
# Build a {word: value} lookup from the lexicon's 'word' and 'value' columns.
affinity_scores = affin.set_index('word')['value'].to_dict()

In [None]:
# Module-level names read by calculate_sentiment below: the spaCy pipeline used to
# lemmatise the text and the word -> score lexicon.
nlp = spacy.load('en_core_web_sm')
sentiment_lexicon = affinity_scores

def calculate_sentiment(text: str = None):
    """Return the summed lexicon score of `text`.

    The text is parsed with the module-level `nlp` pipeline and the lemma of
    every token is looked up in the module-level `sentiment_lexicon`; lemmas
    that are not in the lexicon contribute 0.

    Parameters
    ----------
    text : str, optional
        Text to score. ``None`` or an empty string yields 0 without running
        the pipeline.

    Returns
    -------
    The sum of the lexicon values of all token lemmas (0 for empty input).
    """
    # Guard clause: nothing to parse, nothing to score.
    if not text:
        return 0
    return sum(sentiment_lexicon.get(token.lemma_, 0) for token in nlp(text))

In [None]:
calculate_sentiment(text = 'super')

In [None]:
# Score every sentence with calculate_sentiment and store the result in a new column.
sent_df['sentiment_value'] = sent_df['sentence'].apply(calculate_sentiment) 
sent_df['sentiment_value']

In [None]:
sent_df

In [None]:
sent_df.sort_values(by='sentiment_value').tail(10)

In [None]:
sent_df['sentiment_value'].describe()

In [None]:
sent_df[sent_df['sentiment_value']<=0].head()

In [None]:
# Add an explicit 0..n-1 position column; the line plot below uses it as the x axis.
sent_df['index']=range(0,len(sent_df))
sent_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# distplot() has been deprecated since seaborn 0.11 and is slated for removal;
# histplot() with a KDE overlay on a density scale is the documented
# replacement for the same histogram + density-curve view.
sns.histplot(sent_df['sentiment_value'], kde=True, stat='density')

In [None]:
# Plot the sentiment score of each sentence against its position in sent_df.
plt.figure(figsize=(15, 10))
sns.lineplot(y='sentiment_value',x='index',data=sent_df)

In [None]:
sent_df.plot.scatter(x='word_count', y='sentiment_value', figsize=(8,8), title='Sentence sentiment value to sentence word count')

In [None]:
#checking for the raw data
tweets_data1

In [None]:
# Score each raw tweet in the 'Text' column with calculate_sentiment and store it in a new column.
tweets_data1['sentiment_value'] = tweets_data1['Text'].apply(calculate_sentiment) 
tweets_data1['sentiment_value']

In [None]:
tweets_data1

In [None]:
tweets_data1.sort_values(by='sentiment_value').tail(10)