In [1]:
#import libraries
import pandas as pd
import numpy as np
import string

In [2]:
#import dataset
tweets = pd.read_csv("Elon_musk.csv", encoding="latin1", index_col=0, error_bad_lines=False)
tweets.head()

Unnamed: 0,Text
1,@kunalb11 Im an alien
2,@ID_AA_Carmack Ray tracing on Cyberpunk with H...
3,@joerogan @Spotify Great interview!
4,@gtera27 Doge is underestimated
5,@teslacn Congratulations Tesla China for amazi...


## **Text Preprocessing** 

In [3]:
book = [x.strip() for x in tweets.Text] #remove both the leading and the trailing characters
book = [x.translate(x.maketrans("","",string.punctuation)) for x in book] # punctuation removal
book = [x.lower() for x in book] #convert all the text to lower case
book[0:5]

['kunalb11 i\x92m an alien',
 'idaacarmack ray tracing on cyberpunk with hdr is nextlevel have you tried it',
 'joerogan spotify great interview',
 'gtera27 doge is underestimated',
 'teslacn congratulations tesla china for amazing execution last year now on to the next for even more']

In [4]:
#Update processed data into the dataframe
tweet_df = pd.DataFrame(book,columns=["tweets"])
tweet_df.head()

Unnamed: 0,tweets
0,kunalb11 im an alien
1,idaacarmack ray tracing on cyberpunk with hdr ...
2,joerogan spotify great interview
3,gtera27 doge is underestimated
4,teslacn congratulations tesla china for amazin...


In [5]:
#import stopwords, postive lexicon and negative lexicon
stop_words = pd.read_csv("stop.txt", names=["words"])
stop_words = list(stop_words.words)
pos_lexicon = pd.read_csv("positive-words.txt", names=["p_lexicon"])
pos_lexicon = list(pos_lexicon.p_lexicon)
neg_lexicon = pd.read_csv("negative-words.txt", names=["n_lexicon"], encoding="latin1")
neg_lexicon = list(neg_lexicon.n_lexicon)

### Function to calculate sentiment

In [7]:
#Functiion to calculate the score for whole sentence
# nlp = spacy.load("en_core_web_sm")

#Tokenization
from nltk.tokenize import word_tokenize
import nltk
nltk.download("punkt")

#Lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordlem = WordNetLemmatizer()

#Function
def calculate_sentiment(text):
    sent_score = 0
    if text:
        tokenized = word_tokenize(text)
        lemmetized = [wordlem.lemmatize(word,pos="v") for word in tokenized]
        no_stopword_sent = [word for word in lemmetized if not word in stop_words]
        for word in no_stopword_sent:
            if word in pos_lexicon:
                sent_score +=1
            if word in  neg_lexicon:
                sent_score -=1 
    return sent_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\praing57504\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\praing57504\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
#Adding sentiment score in the main dataframe tweets
tweets["Score"] = tweet_df["tweets"].apply(calculate_sentiment)
tweets.head()

Unnamed: 0,Text,Score
1,@kunalb11 Im an alien,0.0
2,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,1.0
3,@joerogan @Spotify Great interview!,-1.0
4,@gtera27 Doge is underestimated,2.0
5,@teslacn Congratulations Tesla China for amazi...,1.0


In [9]:
#Function to classify the tweet based on score
def classification(score):
  if score == 0:
    t_class = "Neutral"
  elif score > 0:
    t_class = "Positive"
  else:
    t_class = "Negative"
  return t_class

In [10]:
#Adding column for tweet classification

tweets["Tweet_Classification"] = tweets["Score"].apply(classification)

In [11]:
tweets.head()

Unnamed: 0,Text,Score,Tweet_Classification
1,@kunalb11 Im an alien,0.0,Neutral
2,@ID_AA_Carmack Ray tracing on Cyberpunk with H...,1.0,Positive
3,@joerogan @Spotify Great interview!,-1.0,Negative
4,@gtera27 Doge is underestimated,2.0,Positive
5,@teslacn Congratulations Tesla China for amazi...,1.0,Positive
