# Sentiment Analysis

In [130]:
import pandas as pd

ticker = "MSFT"
df = pd.read_csv(f"{ticker}_News_Content.csv")

df.head()

Unnamed: 0,title,time_published,summary,overall_sentiment_score,overall_sentiment_label,ticker,relevance_score,ticker_sentiment_score,ticker_sentiment_label
0,"Facebook, YouTube, TikTok restrict access to R...",20220302T030642,"YouTube, Facebook, Instagram and TikTok are re...",-0.14593,Neutral,MSFT,0.079078,-0.092053,Neutral
1,"Apple, Google, Netflix, and Visa join the grow...",20220302T080000,List of companies pulling out of Russia keeps ...,-0.520567,Bearish,MSFT,0.127795,-0.688359,Bearish
2,Ukraine calls on gaming industry to suspend bu...,20220302T080000,The Ukrainian government is beseeching video g...,-0.162332,Somewhat-Bearish,MSFT,0.09923,-0.215272,Somewhat-Bearish
3,Qualcomm teams up with ByteDance on metaverse-...,20220302T133018,"Qualcomm, the US semiconductor and wireless te...",0.056736,Neutral,MSFT,0.067594,0.106466,Neutral
4,MarketingPulse and eTailingPulse to shed light...,20220303T065619,[Sponsored Article]\r\rThe Covid-19 pandemic h...,0.010325,Neutral,MSFT,0.045456,0.007051,Neutral


In [131]:
# Filter rows with relevance_score > 0.5
df = df[df['relevance_score'] > 0.5]
df = df[df['ticker_sentiment_score'] > 0]

# Extract date and time components
df['Date'] = pd.to_datetime(df['time_published'], format='%Y%m%dT%H%M%S').dt.date
df['Time'] = pd.to_datetime(df['time_published'], format='%Y%m%dT%H%M%S').dt.time

# Convert date to datetime format
df['Date'] = pd.to_datetime(df['Date'])

df_news = df.reset_index()

# Drop unnecessary columns
columns_to_drop = ['ticker', 'time_published', 'Time', 'overall_sentiment_label', 'ticker_sentiment_label']
df_news.drop(columns=columns_to_drop, inplace=True)

# Group by 'Date' and concatenate titles and summaries into a single paragraph
agg_functions = {
    'title': ' '.join,  # Concatenate titles
    'summary': ' '.join,  # Concatenate summaries
    'overall_sentiment_score': 'mean',  # Calculate the mean
    'relevance_score': 'mean',  # Calculate the mean
    'ticker_sentiment_score': 'mean'  # Calculate the mean
}

new_df = df_news.groupby('Date').agg(agg_functions).reset_index()

# Rename columns if needed
new_df.rename(columns={'overall_sentiment_score': 'average_overall_sentiment_score',
                          'relevance_score': 'average_relevance_score',
                          'ticker_sentiment_score': 'average_ticker_sentiment_score'},
                 inplace=True)

# Define a custom function to apply the condition
def calculate_sentiment(score):
    if score > 0.5:
        return 1
    else:
        return 0

# Apply the custom function to create the 'sentiment score' column
new_df['sentiment_score'] = new_df['average_overall_sentiment_score'].apply(calculate_sentiment)


new_df.tail()

Unnamed: 0,Date,title,summary,average_overall_sentiment_score,average_relevance_score,average_ticker_sentiment_score,sentiment_score
438,2023-09-23,1 Rock-Solid Warren Buffett Artificial Intelli...,"A broad customer base, a fortress-like balance...",0.22264,0.752228,0.298543,0
439,2023-09-25,Microsoft is hiring a nuclear energy expert to...,Microsoft is looking to hire a nuclear energy ...,0.209654,0.627831,0.223343,0
440,2023-09-26,"Microsoft, Amazon And Google Among Best AI Inv...",TD Cowen has expressed increased optimism for ...,0.2584,0.637906,0.408222,0
441,2023-09-27,Infosys and Microsoft collaborate to bring enh...,Infosys announced that it is collaborating wit...,0.37704,0.636603,0.575403,0
442,2023-09-28,Microsoft's Security Glitch Reportedly Resulte...,The U.S. State Department suffered a significa...,0.21558,0.56704,0.24935,0


In [132]:
train_size = int(len(new_df) * 0.8)
train, test = new_df[0:train_size], new_df[train_size:]

#Removing punctuations of title and summary columns
df_train = train.iloc[:, :11]
df_train = df_train.replace("[^a-zA-Z]", " ", regex=True)

# Convert 'title' column to lowercase
df_train['title'] = df_train['title'].str.lower()
# Convert 'summary' column to lowercase
df_train['summary'] = df_train['summary'].str.lower()

train_summary_list = df_train['summary'].tolist()

print(train_summary_list[1])

situated in hyderabad  this will be the microsoft s largest data centre in the country shares of mandiant  a      billion cybersecurity firm  closed up     after a report said google is interested in acquiring the company 


In [133]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

#implementing bag-of-words
countVector = CountVectorizer(ngram_range=(3,3))
trainDataset = countVector.fit_transform(train_summary_list)

#implementing random forest classifier
randomClassifier = RandomForestClassifier(n_estimators=200, criterion='entropy')
randomClassifier.fit(trainDataset, train['sentiment_score'])

#predict for the test dataset

#Removing punctuations of title and summary columns
df_test = test.iloc[:, :11]
df_test = df_test.replace("[^a-zA-Z]", " ", regex=True)

# Convert 'title' column to lowercase
df_test['title'] = df_test['title'].str.lower()
# Convert 'summary' column to lowercase
df_test['summary'] = df_test['summary'].str.lower()

test_summary_list = df_test['summary'].tolist()

print(test_summary_list[0])

microsoft  msft  has been one of the stocks most watched by zacks com users lately  so  it is worth exploring what lies ahead for the stock  someone with a lot of money to spend has taken a bearish stance on microsoft msft  we noticed this today when the big position showed up on publicly available options history that we track here at benzinga  whether this is an institution or just a wealthy individual  we don t know  microsoft     down for more than        online users   rd outage     business standard     microsoft     down for thousands of users   downdetector the financial express     tech behemoth microsoft corporation   msft   pioneers the development  licensing  and support of software  services  devices  and solutions  it offers various products such as pcs  tablets  gaming and entertainment consoles  video games  and search services  including bing and microsoft    


In [134]:
testDataset = countVector.transform(test_summary_list)
predictions = randomClassifier.predict(testDataset)
print(predictions)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [135]:
#import library to check the accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

matrix = confusion_matrix(test['sentiment_score'], predictions)
print(matrix)
score = accuracy_score(test['sentiment_score'], predictions)
print(score)
report = classification_report(test['sentiment_score'], predictions)
print(report)

[[89]]
1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        89

    accuracy                           1.00        89
   macro avg       1.00      1.00      1.00        89
weighted avg       1.00      1.00      1.00        89

