In [25]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer

In [26]:
# Download the stopwords and punkt tokenizer
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vidyapani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vidyapani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
# Download the WordNet lemmatizer data
nltk.download('wordnet')

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vidyapani\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
# Download the vader lexicon for sentiment analysis
nltk.download('vader_lexicon')

# Initialize the sentiment analyzer
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\vidyapani\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [29]:
#uplaod the data from your local directory
df = pd.read_csv('data_collection.csv',encoding='latin1')

# Remove rows with null values
df = df.dropna()

# Read the news headlines from a CSV file and drop any duplicates
df.drop_duplicates(inplace=True)

In [30]:
# Preprocess the text by removing stop words, punctuation, and converting to lowercase
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Tokenize the text
    words = word_tokenize(text.lower())
    # add lemmatizer
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalpha()]
    # Remove stopwords and punctuation
    words = [w for w in words if w not in stop_words and re.match(r'[^\W\d]', w)]
    # Join the remaining words back into a string
    return ' '.join(words)

In [31]:
# Define the preprocess function with a polarity threshold of 0.2

def preprocess(text, polarity_threshold=0.1):
    scores = sid.polarity_scores(text)
    polarity = scores['compound']

    if polarity > polarity_threshold:
        sentiment = 'positive'
    elif polarity < -polarity_threshold:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'

    return sentiment

# Apply the preprocessing function to the headline column
df['headline'] = df['headline_text'].apply(preprocess_text)

# Apply the preprocess function to each row of the preprocessed headline column
df['sentiment'] = df['headline'].apply(preprocess)

#find the label counts 
df['sentiment'].value_counts()

negative    14066
positive     9750
neutral      6658
Name: sentiment, dtype: int64

In [32]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['sentiment'], test_size=0.2, random_state=42)

In [33]:
# Vectorize the text using a bag-of-words model
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [34]:
# Train a SVC classifier on the training data
from sklearn.svm import SVC
clf = SVC(kernel='linear', C=1, random_state=42)
clf.fit(X_train, y_train)

In [11]:
# Test the classifier on the testing data and print the accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy on testing data:', accuracy)

Accuracy on testing data: 0.9340442986054143


In [12]:
# Predict the sentiment of the training data and print the accuracy
y_pred_train = clf.predict(X_train)
accuracy_train = accuracy_score(y_train, y_pred_train)
print('Accuracy on training data:', accuracy_train)

Accuracy on training data: 0.9869559867098733


In [24]:
# Predict the sentiment of new input text
input_text = "Stock market closes with minimal change after mixed earnings reports"
preprocessed_text = preprocess_text(input_text)
vectorized_text = vectorizer.transform([preprocessed_text])
prediction = clf.predict(vectorized_text)
print('Prediction:', prediction[0])

Prediction: neutral


In [24]:
#API

In [35]:
!pip install -U flask-cors
import flask
import io
import string
import time
import os
import numpy as np
import tensorflow as tf
from flask import Flask, jsonify, request 
import joblib
from flask import Flask
from flask_cors import CORS

# saving our model
joblib.dump(clf , 'model_jlib')

# opening the file
m_jlib = joblib.load('model_jlib')

app = Flask(__name__)
CORS(app)

# define the API endpoint
@app.route('/predict', methods=['POST'])
def predict():
    # receive the input data from the request
    input_data = request.get_json().get("data")

    # make predictions using the loaded model 
    try:
        preprocessed_text = preprocess_text(input_data)
        vectorized_text = vectorizer.transform([preprocessed_text])
        prediction = clf.predict(vectorized_text)
        return jsonify({'prediction': prediction[0]})
        
    except Exception as e:
        return str(e)

if __name__ == '__main__':
    app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [03/Mar/2023 09:58:22] "OPTIONS /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 09:58:22] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 09:58:30] "OPTIONS /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 09:58:30] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 09:58:39] "OPTIONS /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 09:58:39] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 10:01:24] "OPTIONS /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 10:01:24] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 10:01:57] "OPTIONS /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 10:01:57] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 10:03:24] "OPTIONS /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 10:03:24] "POST /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 10:04:24] "OPTIONS /predict HTTP/1.1" 200 -
127.0.0.1 - - [03/Mar/2023 10:04:24