<a href="https://colab.research.google.com/github/Swastikbhat-lab/Stock-Sentiment-Analysis/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Link to Open in Colab
# ![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)

# # Stock Sentiment Analysis

# This notebook aims to predict whether a stock is bullish or bearish based on sentiment data.

## Import Libraries
import nltk
import re
import string
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Download necessary NLTK data
nltk.download('stopwords')

## Load Data
# Assuming the CSV file is uploaded in Colab environment
data = pd.read_csv('/content/Sentiment_Stock_data.csv')

# Display the first few rows of the dataset
data.head()

## Preprocess Text Data
def process_stock_sentence(sentence):
    if not isinstance(sentence, str):
        sentence = str(sentence)  # Convert to string if not already
    stemmer = nltk.PorterStemmer()
    stopwords_english = stopwords.words('english')
    sentence = re.sub(r'\$[A-Za-z]+', '', sentence)  # Remove stock symbols
    sentence = re.sub(r'https?:\/\/.*[\r\n]*', '', sentence)  # Remove URLs
    sentence = re.sub(r'#', '', sentence)  # Remove hashtags
    tokenizer = nltk.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(sentence)

    clean_tokens = [stemmer.stem(word) for word in tokens if word not in stopwords_english and word not in string.punctuation]
    return clean_tokens

## Feature Engineering
def build_stock_freqs(sentences, sentiments):
    freqs = {}
    for sentence, sentiment in zip(sentences, sentiments):
        for word in process_stock_sentence(sentence):
            pair = (word, sentiment)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1
    return freqs

def extract_features(sentence, freqs):
    word_l = process_stock_sentence(sentence)
    x = np.zeros(3)
    x[0] = 1

    for word in word_l:
        x[1] += freqs.get((word, 1.0), 0)
        x[2] += freqs.get((word, 0.0), 0)
    return x

## Train Model
def train_stock_sentiment_model(sentences, sentiments):
    freqs = build_stock_freqs(sentences, sentiments)
    X = np.zeros((len(sentences), 3))
    y = np.array(sentiments)

    for i, sentence in enumerate(sentences):
        X[i, :] = extract_features(sentence, freqs)

    model = LogisticRegression()
    model.fit(X, y)
    return model, freqs

## Predict Sentiment
def predict_stock_sentiment(sentence, model, freqs):
    features = extract_features(sentence, freqs)
    sentiment = model.predict(features.reshape(1, -1))[0]
    return 'Bullish' if sentiment == 1 else 'Bearish'

## Example Usage
# Split data into training and testing sets
sentences = data['Sentence'].values
sentiments = data['Sentiment'].values
train_x, test_x, train_y, test_y = train_test_split(sentences, sentiments, test_size=0.2, random_state=42)

# Train the model
model, freqs = train_stock_sentiment_model(train_x, train_y)

# Test the model
predictions = [predict_stock_sentiment(sentence, model, freqs) for sentence in test_x]
actual = ['Bullish' if sentiment == 1 else 'Bearish' for sentiment in test_y]

# Evaluate the model
accuracy = accuracy_score(actual, predictions)
print(f'Model Accuracy: {accuracy:.4f}')

# Predict sentiment for a new sentence
example_sentence = "The company reported a significant increase in revenue."
prediction = predict_stock_sentiment(example_sentence, model, freqs)
print(f'Sentiment for the example sentence: {prediction}')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model Accuracy: 0.5303
Sentiment for the example sentence: Bearish
