In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# Download the NLTK 'vader_lexicon' resource (presumably the lexicon behind
# SentimentIntensityAnalyzer, which this notebook instantiates further down).
nltk.download('vader_lexicon')

In [None]:
import pandas as pd
import numpy as np
import string
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
# Download the NLTK resources named 'punkt', 'stopwords' and 'wordnet'.
# The notebook later calls word_tokenize, stopwords.words('english') and
# WordNetLemmatizer, which presumably rely on these resources.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Load the news-headline table from the CSV in the working directory.
NEWS_CSV = "india-news-headlines.csv"
news_headlines = pd.read_csv(NEWS_CSV)

In [None]:
# Load the stock-price table from the CSV in the working directory.
PRICES_CSV = "^NSEI (1).csv"
stock_price_data = pd.read_csv(PRICES_CSV)

In [None]:
# Discard every price row that contains at least one missing value.
stock_price_data = stock_price_data.dropna()

In [None]:
# Display the price table after the rows with missing values were dropped.
stock_price_data

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Standardize the six columns at positions 1-6 and pass every remaining column
# through untouched.
SCALED_COLUMN_POSITIONS = [1, 2, 3, 4, 5, 6]
preprocessor = ColumnTransformer(
    transformers=[
        ('scaled', StandardScaler(), SCALED_COLUMN_POSITIONS),
    ],
    remainder='passthrough',
)


In [None]:
# fit_transform returns a plain array (scaled columns first, the passthrough
# column last), so the column labels have to be re-attached by hand.
SCALED_COLUMNS = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
stock_price_data = pd.DataFrame(
    preprocessor.fit_transform(stock_price_data),
    columns=SCALED_COLUMNS + ['Date'],
)
# Stacking the scaled floats next to the passthrough 'Date' values produces an
# object array, so every column comes back with dtype object. Restore float
# dtypes so rolling statistics, pct_change and corr work on numeric columns.
stock_price_data[SCALED_COLUMNS] = stock_price_data[SCALED_COLUMNS].astype(float)

In [None]:
from datetime import datetime
# Turn the YYYYMMDD publish date into a 'YYYY/MM/DD' string.
# The whole column is parsed and formatted in one vectorised pass instead of
# calling strptime/strftime once per row; the resulting strings are the same.
news_headlines['Date'] = (
    pd.to_datetime(news_headlines['publish_date'].astype(str), format="%Y%m%d")
    .dt.strftime("%Y/%m/%d")
)

In [None]:
# Display the headlines frame, which now carries the derived 'Date' column.
news_headlines

In [None]:
# Rolling and relative features derived from the 'Close' column.
# NOTE(review): 'Close' was standardized by an earlier cell, so 'returns' is the
# percentage change of the scaled values, not of raw prices — confirm intent.
close = stock_price_data['Close']
ten_day_window = close.rolling(window=10)

stock_price_data['MA10'] = ten_day_window.mean()
stock_price_data['MA50'] = close.rolling(window=50).mean()
stock_price_data['volatility'] = ten_day_window.std()
stock_price_data['returns'] = close.pct_change()

# The rolling windows and pct_change leave NaN in their first rows; drop every
# row that contains a NaN.
stock_price_data.dropna(inplace=True)

In [None]:
# Parse the 'Date' column into pandas datetimes so it can be compared and merged.
date_values = news_headlines['Date']
news_headlines['Date'] = pd.to_datetime(date_values)

In [None]:
# Keep only the headlines dated on or after the cutoff.
on_or_after_cutoff = news_headlines['Date'] >= '2007/10/11'
news_headlines = news_headlines[on_or_after_cutoff]

In [None]:
  # One row per date: that day's headlines joined into one space-separated string.
  headlines_by_date = news_headlines.groupby('Date')['headline_text']
  news_headlines_grouped = headlines_by_date.apply(' '.join).reset_index()

In [None]:
def preprocess_news(df, max_features=None):
    """Clean, tokenize and TF-IDF-vectorize the ``headline_text`` column.

    Four helper columns are added to ``df`` in place (``cleaned_headline``,
    ``tokens``, ``processed_tokens`` and ``processed_headline``). The returned
    frame is a new object holding those columns plus one TF-IDF column per
    vocabulary term.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``headline_text``.
    max_features : int or None, default None
        Forwarded to ``TfidfVectorizer``. ``None`` keeps the full vocabulary
        (the previous behaviour); an integer caps the number of TF-IDF columns,
        which bounds the size of the dense matrix built below.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the TF-IDF columns appended, row-aligned on ``df.index``.
    """
    # Built once per call: the original rebuilt the stop-word set and the
    # lemmatizer for every row.
    punctuation_pattern = re.compile(f'[{re.escape(string.punctuation)}]')
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def clean_text(text):
        """Lower-case ``text`` and strip punctuation, digits and repeated whitespace."""
        text = text.lower()
        text = punctuation_pattern.sub('', text)
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def preprocess_tokens(tokens):
        """Drop English stop words and lemmatize the remaining tokens."""
        return [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Text cleaning, tokenization, stop-word removal / lemmatization.
    df['cleaned_headline'] = df['headline_text'].apply(clean_text)
    df['tokens'] = df['cleaned_headline'].apply(word_tokenize)
    df['processed_tokens'] = df['tokens'].apply(preprocess_tokens)

    # The vectorizer expects strings, so join the tokens back together.
    df['processed_headline'] = df['processed_tokens'].apply(' '.join)

    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(df['processed_headline'])

    # Reuse df's index: with a default RangeIndex here, concat would misalign
    # the rows whenever df is not indexed 0..n-1 (e.g. after filtering).
    tfidf_df = pd.DataFrame(
        tfidf_matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        index=df.index,
    )

    # Append the TF-IDF columns to the frame.
    return pd.concat([df, tfidf_df], axis=1)

In [None]:
# Score each day's joined headline text and keep the 'compound' value.
sid = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    return sid.polarity_scores(text)['compound']


news_headlines_grouped['sentiment'] = news_headlines_grouped['headline_text'].apply(_compound_score)

In [None]:
# Length of the value in the first row, second column (position 1) — a quick
# look at how long one day's joined headline text is.
len(news_headlines_grouped.iloc[0,1])

In [None]:
# Run the text pipeline on the per-day headlines. Besides returning the TF-IDF
# frame, preprocess_news also adds its helper columns to news_headlines_grouped.
processed_news_df = preprocess_news(df=news_headlines_grouped)

In [None]:
# Give both frames a datetime 'Date' column so the merge key has one type.
for frame in (stock_price_data, news_headlines_grouped):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [None]:
# Left-join the per-day news frame onto the price rows by date.
merged_data = pd.merge(stock_price_data, news_headlines_grouped, on='Date', how='left')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated in pandas 2.x and
# does not update the parent frame under Copy-on-Write.
merged_data['sentiment'] = merged_data['sentiment'].fillna(0)
# NOTE(review): price rows without a news match still hold NaN in the other
# news columns (e.g. 'headline_text'), so this dropna removes them despite the
# zero-filled sentiment — confirm whether that is intended.
merged_data = merged_data.dropna()

In [None]:
# Pairwise correlations of the numeric columns only. The merged frame also
# holds date and text columns; pandas >= 2.0 no longer skips those by default
# in DataFrame.corr() and raises instead.
merged_data.select_dtypes(include='number').corr()

In [None]:
# Full feature set: both moving averages plus volatility, returns and sentiment.
features = [
    'MA10',
    'MA50',
    'volatility',
    'returns',
    'sentiment',
]
X = merged_data.loc[:, features]
y = merged_data.loc[:, 'Close']


In [None]:
# Reduced feature set without the two moving averages. This re-binds `features`.
features = [
    'volatility',
    'returns',
    'sentiment',
]
X1 = merged_data.loc[:, features]
y1 = merged_data.loc[:, 'Close']

In [None]:
# 80/20 split of the full-feature data; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 80/20 split of the reduced-feature data with the same seed.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest trained on the full feature set.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Keep this model's test-set predictions now: `model` is re-bound to a second
# forest in the next modelling cell, and the first prediction plot reads
# `y_pred`, which was otherwise never assigned.
y_pred = model.predict(X_test)

In [None]:
# Forest with the same hyper-parameters, trained on the reduced feature set.
# This re-binds `model`.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X1_train, y1_train)

In [None]:
# Evaluate the current `model` on the reduced-feature test split.
y1_pred = model.predict(X1_test)
mse = mean_squared_error(y1_test, y1_pred)
rmse = np.sqrt(mse)

In [None]:
# Show the test-split RMSE computed in the previous cell.
print(rmse)

In [None]:
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``.

    Note: this definition re-binds the name ``rmse``, which an earlier cell
    used for the hold-out score.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)


# greater_is_better=False marks the metric as a loss for the scorer.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

# Five shuffled folds with a fixed seed, evaluated on the reduced feature set.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    model,
    X1,
    y1,
    cv=kf,
    scoring=rmse_scorer,
)

In [None]:
# The scorer was built with greater_is_better=False, so flip the sign of the
# fold average before printing it.
mean_cv_rmse = -cv_scores.mean()
print(mean_cv_rmse)

In [None]:
import matplotlib.pyplot as plt

# Plot predictions vs actual values for the full-feature split.
# y_test keeps the shuffled row labels from train_test_split, whereas y_pred is
# expected to be a plain positional array of predictions for X_test. Plotting
# them directly draws the two lines on different x positions, so both are
# ordered by y_test's row label and drawn against the same x-axis.
order = np.argsort(y_test.index.to_numpy())
row_labels = y_test.index.to_numpy()[order]

plt.figure(figsize=(14, 7))
plt.plot(row_labels, y_test.to_numpy(dtype=float)[order], label='Actual Stock Prices')
plt.plot(row_labels, np.asarray(y_pred, dtype=float)[order], label='Predicted Stock Prices')
plt.title('Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel('Stock Price')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Plot predictions vs actual values for the reduced-feature split.
# y1_test keeps the shuffled row labels from train_test_split, whereas y1_pred
# is a plain positional array. Plotting them directly draws the two lines on
# different x positions, so both are ordered by y1_test's row label and drawn
# against the same x-axis.
order = np.argsort(y1_test.index.to_numpy())
row_labels = y1_test.index.to_numpy()[order]

plt.figure(figsize=(14, 7))
plt.plot(row_labels, y1_test.to_numpy(dtype=float)[order], label='Actual Stock Prices')
plt.plot(row_labels, np.asarray(y1_pred, dtype=float)[order], label='Predicted Stock Prices')
plt.title('Stock Price Prediction')
plt.xlabel('Time')
plt.ylabel('Stock Price')
plt.legend()
plt.show()