# **About the Dataset**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve,auc
from xgboost import XGBClassifier

In [None]:
# Load the raw dataset; later cells read its 'Sentence' and 'Sentiment' columns.
df = pd.read_csv('financial_sentiment_data.csv')
# Show the first rows.
df.head() 

In [None]:
df.shape

In [None]:
from textblob import TextBlob

# Read in the dataframe from a CSV file
df = pd.read_csv('financial_sentiment_data.csv')

# Define a function to calculate polarity scores and update sentiment labels
def update_sentiment(row):
    """Return one consistent sentiment label for the sentence in ``row``.

    The same sentence can appear on several rows of the module-level ``df``
    with conflicting labels.  Comparing TextBlob polarity between those rows
    cannot resolve the conflict, because identical sentences always yield
    the identical polarity.  Instead, among the labels actually attached to
    the sentence, the one whose nominal polarity (negative = -1,
    neutral = 0, positive = +1) lies closest to the sentence's TextBlob
    polarity is chosen, so every duplicate row receives the same label.

    Parameters
    ----------
    row : pandas.Series
        A row of ``df`` providing ``'Sentence'`` and ``'Sentiment'``.

    Returns
    -------
    The label to store for this row.  The row's own label is returned
    unchanged when the sentence is unique, when all duplicates already
    agree, or when a competing label is not one of
    ``'negative'`` / ``'neutral'`` / ``'positive'``.
    """
    sentence = row['Sentence']
    sentiment = row['Sentiment']

    # Distinct labels carried by every row holding exactly this sentence.
    candidate_labels = set(df.loc[df['Sentence'] == sentence, 'Sentiment'])

    # Unique sentence, or duplicates that already agree: nothing to resolve.
    if len(candidate_labels) <= 1:
        return sentiment

    nominal_polarity = {'negative': -1.0, 'neutral': 0.0, 'positive': 1.0}
    # Unknown labels cannot be ranked; leave such rows untouched.
    if not candidate_labels.issubset(nominal_polarity):
        return sentiment

    # Only conflicting duplicates pay for the TextBlob analysis.
    polarity_score = TextBlob(sentence).sentiment.polarity

    # sorted() makes the choice deterministic when two labels are equally close.
    return min(
        sorted(candidate_labels),
        key=lambda label: abs(nominal_polarity[label] - polarity_score),
    )

# Apply the function to each row in the dataframe to update the Sentiment column with the updated sentiment labels
df['Sentiment'] = df.apply(update_sentiment, axis=1)

# Print the final result
print(df)


# **Basic Exploratory Data Analysis**

In [None]:
# Look at the first sentence in the dataset
df['Sentence'][0]

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df['Sentiment'].value_counts()

The data is imbalanced.

In [None]:
df.shape

# **DataFrame Separation**

In [None]:
# Three independent, empty frames that share df's column layout,
# one per sentiment class.
positive_df, negative_df, neutral_df = (
    pd.DataFrame(columns=df.columns) for _ in range(3)
)

In [None]:
# Split df into one frame per sentiment label using boolean masks.
# DataFrame.append was removed in pandas 2.0, and appending row by row copied
# the growing frame on every iteration.
# reset_index(drop=True) gives each frame the fresh 0..n-1 index that
# append(..., ignore_index=True) used to produce, and returns a new frame so
# later column assignments do not touch df.
positive_df = df[df["Sentiment"] == "positive"].reset_index(drop=True)
negative_df = df[df["Sentiment"] == "negative"].reset_index(drop=True)
neutral_df = df[df["Sentiment"] == "neutral"].reset_index(drop=True)

# **Text Cleaning**

In [None]:
import nltk

# Fetch the NLTK resources used by the text-cleaning function below:
# 'punkt' backs nltk.word_tokenize and 'wordnet' backs WordNetLemmatizer.
nltk.download('punkt')
nltk.download('wordnet')
# presumably required alongside 'wordnet' by the lemmatizer — TODO confirm
nltk.download('omw-1.4')

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk 
nltk.download('stopwords')


# Patterns and the punctuation table are built once instead of on every call.
# Raw strings replace the original non-raw '\[.*?\]', whose "\[" is an invalid
# string escape that newer Python versions warn about.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_BRACKETED_PATTERN = re.compile(r'\[.*?\]')
_HTML_TAG_PATTERN = re.compile(r'<.*?>')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use so the NLTK corpora are only touched when text is cleaned.
_CLEAN_TEXT_RESOURCES = {}


def _cleaning_resources():
    """Return the cached (English stop-word set, lemmatizer) pair."""
    if not _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['stop_words'], _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalise one sentence for vectorisation.

    Steps, in order: lowercase, strip URLs, strip ``[...]`` spans, strip
    ``<...>`` tags, remove punctuation, tokenize, drop English stop words,
    lemmatize, and re-join the tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw sentence.  Any non-string value (for example a missing value
        read from the CSV) yields an empty string instead of raising.

    Returns
    -------
    str
        The cleaned, space-separated tokens.
    """
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    text = _URL_PATTERN.sub('', text)
    text = _BRACKETED_PATTERN.sub('', text)
    text = _HTML_TAG_PATTERN.sub('', text)

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Tokenize text
    words = nltk.word_tokenize(text)

    stop_words, lemmatizer = _cleaning_resources()

    # Remove stop words, then lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join words to form text
    return ' '.join(words)

In [None]:
df['clean_text'] = df['Sentence'].apply(clean_text)
df

In [None]:
# Extra tokens removed on top of NLTK's English stop words.
_EXTRA_STOPWORDS = ('mn','oyj','ab','inbev','ftsc','plc','afsc','eur','mln','hel','omx','esi')

# Stop-word sets keyed by the extra tokens they include, so the NLTK list is
# read once per distinct configuration rather than once per sentence.
_REM_STOPWORD_CACHE = {}


def rem(text, extra_stopwords=_EXTRA_STOPWORDS):
    """Drop English stop words and extra unwanted tokens from ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated text.
    extra_stopwords : iterable of str, optional
        Lower-case tokens to remove in addition to NLTK's English stop
        words.  Defaults to the tokens this notebook has always removed.

    Returns
    -------
    str
        The remaining words joined by single spaces.  A word is removed
        when its lower-cased form is a stop word; kept words retain their
        original casing.
    """
    cache_key = tuple(extra_stopwords)
    blocked = _REM_STOPWORD_CACHE.get(cache_key)
    if blocked is None:
        blocked = frozenset(stopwords.words('english')).union(cache_key)
        _REM_STOPWORD_CACHE[cache_key] = blocked

    return ' '.join(word for word in text.split() if word.lower() not in blocked)

In [None]:
df['clean_text'] = df['clean_text'].apply(rem)

In [None]:
df

In [None]:
positive_df['clean_text_positive'] = positive_df['Sentence'].apply(clean_text)
positive_df

In [None]:
negative_df['clean_text_negative'] = negative_df['Sentence'].apply(clean_text)
negative_df

In [None]:
neutral_df['clean_text_neutral'] = neutral_df['Sentence'].apply(clean_text)
neutral_df

# **Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the labels in df['Sentiment'] with the integer codes produced by the encoder.
# predict_sentiment (end of the notebook) maps 0/1/2 back to Negative/Neutral/Positive.
# NOTE(review): this overwrites the column it reads, so re-running the cell
# encodes the already-encoded integers; reload the data before running it again.
le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])

In [None]:
df.head()

# **TF-IDF - Term Frequency-Inverse Document Frequency**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

In [None]:
X_tfidf = vectorizer.fit_transform(df['clean_text'])

In [None]:
vectorizer.vocabulary_

In [None]:
# vocabulary_ maps each term to its column index, but the dict's key order is
# the order in which terms were first seen, not column order.  Sort the terms
# by their index so each name labels the matrix column it belongs to.
feature_names = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

In [None]:
X_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)

In [None]:
X_tfidf.head()

In [None]:
y = df['Sentiment']

In [None]:
!pip install imblearn

# **Balancing the Data with SMOTE**

In [None]:
# Apply SMOTE to balance dataset

from imblearn.over_sampling import SMOTE

# Encoded sentiment labels for every row of df.
y = df['Sentiment']
smote = SMOTE(random_state=42)
# Oversample the full TF-IDF matrix; X and y are rebound to the resampled data.
# NOTE(review): X and y now contain synthetic rows generated from the whole
# dataset, so anything scored on them is not scored on purely original data.
X, y = smote.fit_resample(X_tfidf, y)

# Class counts after resampling.
y.value_counts()

# **Multinomial Naive Bayes**

In [None]:
# Hold out the test set from the original (un-resampled) TF-IDF features first,
# then oversample only the training portion.  Splitting SMOTE output instead
# trains on synthetic rows interpolated from held-out sentences and tests on
# synthetic rows, which inflates every reported metric.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, df['Sentiment'], test_size=0.3, random_state=42
)
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Train Multinomial Naive Bayes model
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

In [None]:
# Predict on test data
y_pred = mnb.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

In [None]:
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import make_pipeline

# Cross-validate on the original data with SMOTE inside the pipeline, so each
# fold oversamples its own training part and is scored on untouched rows.
# Scoring on data that was resampled beforehand lets synthetic neighbours of
# the validation rows leak into training.
cv_model = make_pipeline(SMOTE(random_state=42), MultinomialNB())
Scores = cross_val_score(cv_model, X=X_tfidf, y=df['Sentiment'], cv=5)
print(Scores)

Saving the trained model and the fitted TF-IDF vectorizer


In [None]:
import pickle

In [None]:
# The context manager closes the file handle once the model has been written.
with open('mnb_model.plk', 'wb') as model_file:
    pickle.dump(mnb, model_file)

In [None]:
# The context manager closes the file handle once the vectorizer has been written.
with open('tf_idf_model.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# Load the model back from the file the notebook actually writes
# ('mnb_model.plk'); 'trained-model.sav' is never created here.
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open('mnb_model.plk', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
def predict_sentiment(input_text, loaded_model, fitted_vectorizer=None):
    """Classify one piece of text as 'Negative', 'Neutral' or 'Positive'.

    Parameters
    ----------
    input_text : str
        Raw text.  It is passed through ``clean_text`` and then ``rem``,
        the same two steps applied to the training sentences.
    loaded_model
        Fitted classifier; its ``predict`` method receives the TF-IDF
        features of the cleaned text.
    fitted_vectorizer : optional
        Fitted TF-IDF vectorizer used to transform the text.  Defaults to
        the notebook-level ``vectorizer`` fitted on the training corpus.

    Returns
    -------
    str
        'Negative', 'Neutral' or 'Positive' for predicted codes 0, 1, 2.

    Raises
    ------
    ValueError
        If the model predicts anything other than 0, 1 or 2.
    """
    # A freshly constructed TfidfVectorizer has no vocabulary and cannot
    # transform text, so reuse the vectorizer that was fitted for training.
    if fitted_vectorizer is None:
        fitted_vectorizer = vectorizer

    # Clean the input text, then remove the extra stop words
    cleaned_text = rem(clean_text(input_text))

    # Vectorize the input text
    input_text_vectorized = fitted_vectorizer.transform([cleaned_text])

    # Predict with the model that was passed in, not the global ``mnb``
    result = loaded_model.predict(input_text_vectorized)[0]

    # Convert the prediction into a human-readable sentiment label
    labels = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    if result not in labels:
        raise ValueError(f'Unexpected sentiment code: {result!r}')
    return labels[result]

In [None]:
import inspect

# Get the definition of the predict_sentiment function
definition = inspect.getsource(predict_sentiment)

# Print the definition
print(definition)