## Stock news prediction

In [None]:
import pandas as pd


In [None]:
# Load the labelled stock-sentiment dataset used for training and evaluation.
# The location lives in one named constant so it can be changed in a single place.
# NOTE(review): the path points into a mounted Drive folder (presumably Colab);
# no mount step is visible in this notebook — confirm it runs on a fresh kernel.
DATA_PATH = "/content/drive/MyDrive/internship evaluation task/stock_sentiment.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to check the column layout and sample values.
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [None]:
# Summarise column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5791 entries, 0 to 5790
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       5791 non-null   object
 1   Sentiment  5791 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.6+ KB


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(5791, 2)

In [None]:
# Count missing values per column.
df.isnull().sum()

Text         0
Sentiment    0
dtype: int64

In [None]:
# Distinct label values present in the Sentiment column.
df["Sentiment"].unique()

array([1, 0])

In [None]:
# Class balance: number of rows per Sentiment label.
df["Sentiment"].value_counts()

1    3685
0    2106
Name: Sentiment, dtype: int64

In [None]:
# Data cleaning operations
# 1. Removing punctuation
import re
import string
import unicodedata



In [None]:
# Removal of punctuation (ASCII and Unicode)
def remove_punctuations(Text):
    """Lower-case ``Text`` and delete every punctuation character.

    Characters are deleted, not replaced by a space, so ``"12.00"`` becomes
    ``"1200"``. Two groups of characters are removed:

    * everything in ``string.punctuation`` (ASCII punctuation and symbols such
      as ``$``, ``+`` and ``|``);
    * any character whose Unicode general category starts with ``P``, which
      covers typographic quotes, apostrophes, dashes and ellipses. Without
      this, ``"day's"`` and ``"day’s"`` would clean to different strings.

    Parameters
    ----------
    Text : str
        Raw headline text.

    Returns
    -------
    str
        The lower-cased text without punctuation. Whitespace is left untouched.

    Raises
    ------
    AttributeError
        If ``Text`` is not a string (for example a NaN cell), because it has
        no ``lower`` method.
    """
    return "".join(
        ch
        for ch in Text.lower()
        # string.punctuation also contains symbol characters that are not in a
        # Unicode "P" category, so both checks are needed.
        if ch not in string.punctuation
        and not unicodedata.category(ch).startswith("P")
    )

In [None]:
# Store a normalised copy of every headline (output of remove_punctuations)
# in a new column; the raw Text column is kept unchanged.
df['cleaned_headline'] = df['Text'].apply(remove_punctuations)

In [None]:
# Display the frame to compare Text with cleaned_headline side by side.
df

Unnamed: 0,Text,Sentiment,cleaned_headline
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kickers on my watchlist xide tit soq pnk cpw b...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user aap movie 55 return for the feageed indic...
2,user I'd be afraid to short AMZN - they are lo...,1,user id be afraid to short amzn they are look...
3,MNTA Over 12.00,1,mnta over 1200
4,OI Over 21.37,1,oi over 2137
...,...,...,...
5786,Industry body CII said #discoms are likely to ...,0,industry body cii said discoms are likely to s...
5787,"#Gold prices slip below Rs 46,000 as #investor...",0,gold prices slip below rs 46000 as investors b...
5788,Workers at Bajaj Auto have agreed to a 10% wag...,1,workers at bajaj auto have agreed to a 10 wage...
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...",1,sharemarket live sensex off day’s high up 600 ...


In [None]:
# Select the model input (cleaned headline text) and the target label.
# The actual train/test split happens in the next cell.
X = df['cleaned_headline']
y = df['Sentiment']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified, so the label proportions in the
# train and test parts may differ from the full data — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Convert the headline strings into bag-of-words count vectors with CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=1000)  # vocabulary capped at 1000 terms
# The vocabulary is learned from the training split only and then reused for
# the test split, so no test-set text influences the features.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Baseline model: multinomial Naive Bayes trained on the count vectors.
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_vec, y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Score the Naive Bayes model on the held-out test split.
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.77


In [None]:
#Model with random forest algorithm
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 150 trees; random_state makes the fit reproducible.
rf_model = RandomForestClassifier(n_estimators=150, random_state=42)
rf_model.fit(X_train_vec, y_train)

In [None]:
# Evaluate the random forest on the held-out test split.
# This overwrites y_pred and accuracy from the previous model's evaluation.
y_pred = rf_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.79


In [None]:
# Hyperparameter search space for GridSearchCV:
# 3 x 3 x 3 x 3 = 81 parameter combinations in total.
param_grid = {
    'n_estimators': [50, 100, 200],       # number of trees in the forest
    'max_depth': [None, 10, 20],          # None leaves tree depth unlimited
    'min_samples_split': [2, 5, 10],      # minimum samples needed to split a node
    'min_samples_leaf': [1, 2, 4]         # minimum samples required in a leaf
}

In [None]:
from sklearn.model_selection import GridSearchCV
# NOTE: this rebinds rf_model to a fresh, unfitted estimator, so the forest
# fitted earlier under the same name is no longer reachable. GridSearchCV fits
# clones of the estimator it is given, which means rf_model itself is still
# unfitted after the search — use best_rf_model for predictions and saving.
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid with 3-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3)
grid_search.fit(X_train_vec, y_train)

# Estimator with the best cross-validated parameters, refitted on the whole
# training split (GridSearchCV's default refit=True).
best_rf_model = grid_search.best_estimator_

In [None]:
# Evaluate the tuned random forest on the held-out test split.
y_pred = best_rf_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.80


In [None]:
# Predict the sentiment label of a new, unseen headline.
# NOTE(review): the following prediction cells repeat these same steps with a
# different headline; a small helper function would remove the duplication.
news_to_predict = ["Positive news about Company XYZ boosts stock prices."]
# Apply the same cleaning and the already-fitted vectorizer that were used for training.
news_to_predict_cleaned = [remove_punctuations(news) for news in news_to_predict]
news_to_predict_vec = vectorizer.transform(news_to_predict_cleaned)
predicted_movement = best_rf_model.predict(news_to_predict_vec)
# Label 1 is reported as 'Positive'; any other label as 'Negative'.
print(f"Predicted Movement: {'Positive' if predicted_movement[0] == 1 else 'Negative'}")

Predicted Movement: Positive


In [None]:
# Example with explicitly negative wording: clean, vectorize with the fitted
# vocabulary, then classify with the tuned random forest.
news_to_predict = ["Negative news leads to decrease in the stock prices"]
news_to_predict_cleaned = [remove_punctuations(news) for news in news_to_predict]
news_to_predict_vec = vectorizer.transform(news_to_predict_cleaned)
predicted_movement = best_rf_model.predict(news_to_predict_vec)
# Label 1 is reported as 'Positive'; any other label as 'Negative'.
print(f"Predicted Movement: {'Positive' if predicted_movement[0] == 1 else 'Negative'}")

Predicted Movement: Negative


In [None]:
# Example with no obvious sentiment words. The classifier only knows the two
# labels seen in training, so it still returns one of them.
news_to_predict = ["we are going to take an action for the things that you have done "]
news_to_predict_cleaned = [remove_punctuations(news) for news in news_to_predict]
news_to_predict_vec = vectorizer.transform(news_to_predict_cleaned)
predicted_movement = best_rf_model.predict(news_to_predict_vec)
# Label 1 is reported as 'Positive'; any other label as 'Negative'.
print(f"Predicted Movement: {'Positive' if predicted_movement[0] == 1 else 'Negative'}")

Predicted Movement: Positive


In [None]:
# Example with strongly negative wording: clean, vectorize, classify.
news_to_predict = ["This is the worst stock among all the stocks "]
news_to_predict_cleaned = [remove_punctuations(news) for news in news_to_predict]
news_to_predict_vec = vectorizer.transform(news_to_predict_cleaned)
predicted_movement = best_rf_model.predict(news_to_predict_vec)
# Label 1 is reported as 'Positive'; any other label as 'Negative'.
print(f"Predicted Movement: {'Positive' if predicted_movement[0] == 1 else 'Negative'}")

Predicted Movement: Negative


In [None]:
# Example with short positive wording: clean, vectorize, classify.
news_to_predict = ["your stocks are good "]
news_to_predict_cleaned = [remove_punctuations(news) for news in news_to_predict]
news_to_predict_vec = vectorizer.transform(news_to_predict_cleaned)
predicted_movement = best_rf_model.predict(news_to_predict_vec)
# Label 1 is reported as 'Positive'; any other label as 'Negative'.
print(f"Predicted Movement: {'Positive' if predicted_movement[0] == 1 else 'Negative'}")

Predicted Movement: Positive


In [None]:
import pickle

In [None]:
# Persist the tuned model selected by the grid search.
# `rf_model` is only the unfitted template that was handed to GridSearchCV
# (the search fits clones of it), so pickling it would save a model that cannot
# predict. `best_rf_model` is the fitted estimator used for the predictions above.
# The context manager guarantees the file is flushed and closed.
# NOTE: the fitted `vectorizer` is also required to turn raw text into features
# for this model; it is not saved by this cell.
with open('stock_news_prediction.pkl', 'wb') as model_file:
    pickle.dump(best_rf_model, model_file)