<a href="https://colab.research.google.com/github/PiriPiri57/Local-repo/blob/main/Stock_Market.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install yfinance
!pip install yfinance --quiet
!pip install pandas_market_calendars --quiet

# Step 2: Import libraries
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
import datetime
import numpy as np
import pandas_market_calendars as mcal
import warnings
warnings.filterwarnings('ignore')
drive.mount('/content/drive')

# --- Instrument and exchange calendar ---
ticker = 'RELIANCE.NS'  # any symbol Yahoo Finance recognises can be used here
stock = yf.Ticker(ticker)
nse = mcal.get_calendar('NSE')

# --- Locate the most recent trading session ---
# The NSE calendar schedule (not weekday arithmetic) decides which dates count
# as sessions inside a 10-day look-back window ending today.
today = datetime.datetime.today()
end = today
start = end - datetime.timedelta(days=10)

schedule = nse.schedule(
    start_date=f"{start:%Y-%m-%d}",
    end_date=f"{end:%Y-%m-%d}",
)
trading_days = schedule.index

# Use today when it is itself a session; otherwise fall back to the latest
# session that lies strictly before the current timestamp.
if today.date() not in trading_days.date:
    earlier_sessions = trading_days[trading_days < np.datetime64(today)]
    last_working_day = earlier_sessions.max().date()
else:
    last_working_day = today.date()

print(f"Last working day: {last_working_day}")

# --- Download history up to and including the last trading day ---
# One day is added to `end` so that the last trading day itself falls inside
# the requested range.
download_end = last_working_day + datetime.timedelta(days=1)
df = yf.download(ticker, end=download_end.strftime('%Y-%m-%d'))
df.reset_index(inplace=True)  # turn the date index into a regular 'Date' column

# --- Persist the entire dataset to Drive ---
save_path = "/content/drive/My Drive/DS_Dataset_FinanceTrends.csv"
df.to_csv(save_path, index=True)

# Untouched copy of the downloaded frame; later cells mutate `df` in place.
df_copy = df.copy()

# --- Display first few rows ---
print("First rows of stock data:")
print(df.head())

# --- Plot the Close price ---
# Plot against the 'Date' column: after reset_index() the frame's index is a
# plain row counter, which would not match the "Date" x-axis label.
plt.figure(figsize=(12, 6))
plt.plot(df['Date'], df['Close'], label='Close Price', color='blue')
plt.title(f"{ticker} Stock Price Over Time")
plt.xlabel("Date")
plt.ylabel("Close Price")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Missing-value count per column
df.isna().sum()

In [None]:
# Most recent rows of the downloaded data
df.tail(5)

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise scatter plots across the columns of the frame
sns.pairplot(data=df)

In [None]:
from sklearn.preprocessing import StandardScaler

# Make sure 'Date' holds real datetimes before it is used as a plot axis.
df['Date'] = pd.to_datetime(df['Date'])

# Standardised copy of the traded volume, kept as its own column.
scaler = StandardScaler()
df['Volume_scaled'] = scaler.fit_transform(df[['Volume']])

# Line chart of the standardised volume over time.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(df['Date'], df['Volume_scaled'], color='blue')
ax.set_title('Volume over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Volume')
ax.grid(True)
plt.show()

In [None]:
# Remove the constant corporate-action columns when they are present.
constant_cols = ['Dividends', 'Stock Splits']
df = df.drop(columns=constant_cols, errors='ignore')


df.head()

In [None]:
import numpy as np

# --- 1. Calendar features taken from the trade date ---
trade_date = df['Date'].dt
df['Year'] = trade_date.year
df['Month'] = trade_date.month
df['Day'] = trade_date.day
df['DayOfWeek'] = trade_date.dayofweek
df['IsMonthStart'] = trade_date.is_month_start.astype(int)
df['IsMonthEnd'] = trade_date.is_month_end.astype(int)

# --- 2. Lagged closes: the close 1, 5 and 10 rows earlier ---
for n_rows in (1, 5, 10):
    df[f'lag_{n_rows}'] = df['Close'].shift(n_rows)

# --- 3. Rolling means of the close over 5, 10 and 20 rows ---
for window in (5, 10, 20):
    df[f'rolling_mean_{window}'] = df['Close'].rolling(window=window).mean()

# --- 4. Returns ---
df['Daily_Return'] = df['Close'].pct_change()
df['Log_Return'] = np.log(df['Close'] / df['Close'].shift(1))

# --- 5. Volume features ---
# NOTE(review): despite its name, 'Log_Volume' is the previous row's volume;
# no logarithm is applied. Confirm whether a log transform was intended.
df['Log_Volume'] = df['Volume'].shift(1)
df['Rolling_Volume'] = df['Volume'].rolling(3).mean()

# --- Discard the leading rows left incomplete by shift/rolling ---
df = df.dropna()

In [None]:
# Snapshot taken while the 'Date' column is still present; later cells read
# today's row and the Close series from it.
df_raw = df.copy()

# 'Date' itself is not used as a model input (its parts were extracted above).
df = df.drop(columns=['Date'], errors='ignore')
df.head()

In [None]:
# First eight rows of the engineered feature frame
df.head(n=8)

In [None]:

# Annotated correlation matrix of every remaining column
corr_matrix = df.corr()
plt.figure(figsize=(20, 6))
sns.heatmap(data=corr_matrix, annot=True)

In [None]:
# Columns removed because they are highly correlated with the ones kept
# (chosen from the heatmap, to limit overfitting).
correlated_cols = [
    'lag_5', 'lag_10', 'Open', 'Low', 'Rolling_Volume', 'Daily_Return',
    'IsMonthStart', 'rolling_mean_10', 'rolling_mean_20',
]
df = df.drop(columns=correlated_cols, errors='ignore')
df.head()

In [None]:
# Correlation matrix again, now for the reduced column set
reduced_corr = df.corr()
plt.figure(figsize=(15, 6))
sns.heatmap(data=reduced_corr, annot=True)

In [None]:
from sklearn.preprocessing import StandardScaler

import seaborn as sns

# Continuous columns that are standardised in place (this includes the
# 'Close' target itself).
# NOTE(review): the scaler is fit on every row, including rows that a later
# cell holds out as the test split.
scaled_cols = ['High', 'Close', 'Volume', 'lag_1', 'rolling_mean_5', 'Log_Return']
scaler = StandardScaler()
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])
df.head()



In [None]:
# Summary statistics of the (standardised) close column
df['Close'].describe()

In [None]:
# Outlier check on the close column: for a stock, outliers can indicate
# breaks in the trend. A box plot is drawn first, then a distribution plot.
close_values = df['Close']
sns.boxplot(close_values)
sns.displot(close_values)
plt.show()

In [None]:
# Keep the newest row (taken from the df_raw snapshot) aside as "today" and
# remove the newest row from the modelling frame.
# Re-running this cell removes one further row from `df` each time.
today_data = df_raw.iloc[-1]
df = df.head(-1)
df.describe()

In [None]:
import warnings
warnings.filterwarnings('ignore')

# One-row frame for today, with the same columns removed that were removed
# from the modelling frame (plus 'Date').
unused_cols = [
    'Date', 'lag_5', 'lag_10', 'Open', 'Low', 'Rolling_Volume', 'Daily_Return',
    'IsMonthStart', 'rolling_mean_10', 'rolling_mean_20',
]
today_data_df = pd.DataFrame([today_data]).drop(columns=unused_cols, errors='ignore')
today_data_df.describe()

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# --- 1. Prepare Data ---

# Order the rows BEFORE extracting X and y. Sorting afterwards (as this cell
# previously did) could never influence the split, because X and y had
# already been taken from the unsorted frame.
df = df.sort_index()

# Features and target ('Close' is the value being predicted)
X = df.drop(['Close'], axis=1)
y = df['Close']

# Chronological split: the first 80% of rows train, the remaining 20% test.
train_size = int(len(df) * 0.8)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

# Scale features; the scaler is fit on the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --- 2. Define Models ---

models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42, verbosity=0)
}

# --- 3. Train and Evaluate ---

# One entry per model name, holding its test-set MAE, RMSE and R².
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results[name] = {
        'MAE': mae,
        'RMSE': rmse,
        'R² Score': r2
    }

# --- 4. Display Results ---
results_df = pd.DataFrame(results).T  # rows = models, columns = metrics
print(results_df)


In [None]:
# The regressor with the highest test-set R² is carried forward.
best_model_name = results_df['R² Score'].idxmax()
best_model = models[best_model_name]

print(f"Best Model Selected: {best_model_name}")

import matplotlib.pyplot as plt

# Plot styling ('default', 'bmh', ... are alternatives)
plt.style.use('ggplot')

# One horizontal bar chart per metric, side by side.
metrics = ['MAE', 'RMSE', 'R² Score']
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

for ax, metric in zip(axes, metrics):
    # Error metrics are sorted ascending, R² descending.
    lower_is_better = metric != 'R² Score'
    sorted_data = results_df[metric].sort_values(ascending=lower_is_better)
    sorted_data.plot(kind='barh', ax=ax, color='skyblue')
    ax.set_title(f'Model Comparison - {metric}')
    ax.set_xlabel(metric)
    ax.set_ylabel('Models')
    ax.grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()




# From the above plot we infer that Ridge and Linear Regression are the best fit

In [None]:
# --- Predict today's close with the selected regression model ---

# Fill any NaNs in today's feature row
today_data_df.fillna(0, inplace=True)

# Keep only the columns the regressors were trained on, in training order.
# 'Gain/Fall' is ignored if a later cell has already added it to df.
X = df.drop(columns=['Close', 'Gain/Fall'], errors='ignore')
feature_names = X.columns
today_data_df = today_data_df[feature_names]

# today_data comes from df_raw, i.e. from BEFORE the in-place StandardScaler
# pass that the training frame received. That scaler object has since been
# overwritten, so it is re-fit here on the same raw columns and rows, which
# reproduces the same per-column mean and scale.
stage1_cols = ['High', 'Close', 'Volume', 'lag_1', 'rolling_mean_5', 'Log_Return']
stage1_scaler = StandardScaler().fit(df_raw[stage1_cols].to_numpy(dtype=float))
today_stage1 = stage1_scaler.transform(
    df_raw[stage1_cols].iloc[[-1]].to_numpy(dtype=float))[0]

# Overwrite today's raw values with their standardised counterparts.
today_row = today_data_df.to_numpy(dtype=float, copy=True)
column_labels = list(feature_names.get_level_values(0))
for pos, col in enumerate(stage1_cols):
    if col in column_labels:  # 'Close' is the target, so it is not a feature
        today_row[0, column_labels.index(col)] = today_stage1[pos]

# Second pass: the scaler that was fit on X_train in the benchmark cell.
today_data_scaled = scaler.transform(pd.DataFrame(today_row, columns=feature_names))

# Predict on the SCALED row (the model was trained on scaled features).
today_prediction = best_model.predict(today_data_scaled)
predicted_scaled = np.asarray(today_prediction, dtype=float).flatten()[0]

# The target was standardised too, so map the prediction back to price units.
close_pos = stage1_cols.index('Close')
predicted_close = (predicted_scaled * stage1_scaler.scale_[close_pos]
                   + stage1_scaler.mean_[close_pos])

# Actual close for today, in raw price units
actual_close = today_data['Close'].item()

# Accuracy = 100% minus the absolute percentage error
accuracy = (1 - abs(actual_close - predicted_close) / actual_close) * 100

# Display prediction
print("Today's Close Prediction Results")
print(f"Predicted Close Value ({best_model_name}): {predicted_close:.2f}")
print(f"Actual Close Value: {actual_close:.2f}")
print(f"Prediction Accuracy: {accuracy:.2f}%")


In [None]:
# First rows of the modelling frame
df.head(5)

In [None]:
# Label each row by whether the NEXT row's close is higher (1 = Gain, 0 = Fall).
df['Gain/Fall'] = np.where(df['Close'].shift(-1) > df['Close'], 1, 0)

# The final row has no next close, so the comparison above is False and it
# would be labelled "Fall" without evidence. Leave it out of the labelled set.
df_cls = df.iloc[:-1]

# Class balance. Slice labels are derived from the class values themselves,
# because value_counts() orders by frequency and fixed labels could be swapped.
class_counts = df_cls['Gain/Fall'].value_counts().sort_index()
class_names = {0: "Fall", 1: "Gain"}
plt.pie(class_counts.values,
        labels=[class_names[int(cls)] for cls in class_counts.index],
        autopct='%1.1f%%')
plt.show()

# Features and target for classification: every column except the close and
# the label itself.
X_cls = df_cls.drop(['Close', 'Gain/Fall'], axis=1)
y_cls = df_cls['Gain/Fall']

# Chronological train/test split (first 80% of rows train)
train_size_cls = int(len(df_cls) * 0.8)
X_train_cls, X_test_cls = X_cls.iloc[:train_size_cls], X_cls.iloc[train_size_cls:]
y_train_cls, y_test_cls = y_cls.iloc[:train_size_cls], y_cls.iloc[train_size_cls:]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Candidate classifiers. This rebinds `models` from the regression dict to a
# list, which the confusion-matrix and ROC cells below iterate over.
models = [LogisticRegression(),
  XGBClassifier(),
  RandomForestClassifier(n_estimators=100, random_state=42),
  DecisionTreeClassifier(random_state=42),
  KNeighborsClassifier(n_neighbors=5),
  GradientBoostingClassifier(n_estimators=100, random_state=42)]

best_score = 0
best_classifier = None

# Fit every candidate and keep the one with the highest test-set ROC-AUC.
# Each classifier is scored with ITS OWN probabilities; the earlier version
# used a leftover `model` variable from the regression cell instead.
for clf in models:
  clf.fit(X_train_cls, y_train_cls)
  train_auc = metrics.roc_auc_score(
      y_train_cls, clf.predict_proba(X_train_cls)[:, 1])
  val_auc = metrics.roc_auc_score(
      y_test_cls, clf.predict_proba(X_test_cls)[:, 1])
  # The values printed below are ROC-AUC scores.
  print(f'{clf.__class__.__name__} : ')
  print('Training Accuracy : ', train_auc)
  print('Validation Accuracy : ',val_auc)
  print()
  if val_auc > best_score:
      best_score = val_auc
      best_classifier = clf

best_classifier.__class__.__name__

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# 2 x 3 grid of subplots, flattened so each axis can be paired with a model
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
axes = axes.ravel()

# One confusion matrix per classifier. from_estimator() is given the model
# and the test data directly, so no separate predict() call is made here.
for ax, model in zip(axes, models):
    ConfusionMatrixDisplay.from_estimator(
        model, X_test_cls, y_test_cls, ax=ax, cmap='Blues', display_labels=model.classes_)

    ax.set_title(f'{model.__class__.__name__}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.grid(False)

# Adjust layout for better readability
plt.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
import matplotlib.pyplot as plt

# All ROC curves share one set of axes.
fig, ax = plt.subplots(figsize=(8, 6))

for model in models:
    # Probability assigned to class 1 for every test row
    y_probs = model.predict_proba(X_test_cls)[:, 1]

    # Curve coordinates and area under the curve
    fpr, tpr, thresholds = roc_curve(y_test_cls, y_probs)
    auc_score = roc_auc_score(y_test_cls, y_probs)

    ax.plot(fpr, tpr, label=f'{model.__class__.__name__} (AUC = {auc_score:.3f})')

# Diagonal reference line for a random classifier
ax.plot([0, 1], [0, 1], 'k--', label="Random Chance (AUC = 0.5)")

ax.set_title('ROC Curves for All Models')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
ax.grid(True, alpha=0.3)

plt.show()



In [None]:

# The classifiers were trained on X_cls, whose price/volume columns had been
# standardised in place, while today's row was taken from the raw df_raw
# snapshot. Re-fit that standardisation on the same raw columns and rows so
# today's features are on the same scale as the training rows.
stage1_cols = ['High', 'Close', 'Volume', 'lag_1', 'rolling_mean_5', 'Log_Return']
stage1_scaler = StandardScaler().fit(df_raw[stage1_cols].to_numpy(dtype=float))
today_stage1 = stage1_scaler.transform(
    df_raw[stage1_cols].iloc[[-1]].to_numpy(dtype=float))[0]

# Today's features, in the classifier's training column order
today_features = today_data_df[X_train_cls.columns].to_numpy(dtype=float, copy=True)
column_labels = list(X_train_cls.columns.get_level_values(0))
for pos, col in enumerate(stage1_cols):
    if col in column_labels:  # 'Close' is not a classifier input
        today_features[0, column_labels.index(col)] = today_stage1[pos]

# Predict probability of gain (class 1)
prob_gain = best_classifier.predict_proba(today_features)[0][1]

# Final call: anything above the 0.4 cut-off counts as a gain (tunable)
prediction = 1 if prob_gain > 0.4 else 0
print(prediction)

In [None]:
# --- PURE ARIMA MODEL (without pmdarima) ---
import warnings
warnings.filterwarnings('ignore')
# Install if needed
!pip install statsmodels --quiet

# Import libraries
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

# Prepare the series
series = df_raw['Close']
!pip install statsmodels --quiet

# Imports
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Plot ACF and PACF
plt.figure(figsize=(16,6))

plt.subplot(1,2,1)
plot_acf(series, lags=50, ax=plt.gca())
plt.title('Autocorrelation Function (ACF)')

plt.subplot(1,2,2)
plot_pacf(series, lags=100, ax=plt.gca(), method='ywm')
plt.title('Partial Autocorrelation Function (PACF)')

plt.show()

# Split into train/test
train_size = int(len(series) * 0.8)
train, test = series[:train_size], series[train_size:]

# --- Choose ARIMA order manually ---
# Common starting point: (p,d,q) = (5,1,0) or (2,1,2)
# You can tune these based on ACF/PACF plots if you want!


In [None]:
# Install if needed
!pip install statsmodels --quiet

# Import libraries
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
# Prepare the series
series = df_raw['Close']

# Split into train/test
train_size = int(len(series) * 0.8)
train, test = series[:train_size], series[train_size:]

# --- Set SARIMA orders ---
order = (1, 0, 1)
seasonal_order = (1, 0, 1, 24)      #seasonality is 24 that is 2 years

# --- Train SARIMA Model ---
sarima_model = SARIMAX(train,
                       order=order,
                       seasonal_order=seasonal_order,
                       enforce_stationarity=False,
                       enforce_invertibility=False)
sarima_result = sarima_model.fit()

# --- Forecast ---
n_periods = len(test)
forecast = sarima_result.forecast(steps=n_periods)

# --- Evaluate ---
rmse = np.sqrt(mean_squared_error(test, forecast))
print(f'SARIMA RMSE: {rmse:.2f}')

# --- Plot ---
plt.figure(figsize=(14,6))
plt.plot(train.index, train, label='Train')
plt.plot(test.index, test, color='green', label='Test')
plt.plot(test.index, forecast, color='red', label='Forecast (SARIMA)')
plt.title('SARIMA Forecast vs Actuals')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# --- Predict Today's Close Price using SARIMA ---
import warnings
warnings.filterwarnings('ignore')

# Fit on the FULL series; this result is also reused by the future-forecast cell.
full_sarima_model = SARIMAX(series,
                            order=order,
                            seasonal_order=seasonal_order,
                            enforce_stationarity=False,
                            enforce_invertibility=False)
full_sarima_result = full_sarima_model.fit()

# One-step-ahead prediction for the LAST observation (today). Using the
# train-only `sarima_result` here would return the first test-period value,
# and a next-step forecast from the full model would not correspond to today.
# Note: the model parameters were estimated on the full series, today included.
last_pos = len(series) - 1
today_forecast = full_sarima_result.predict(start=last_pos, end=last_pos)
predicted_today_close = np.asarray(today_forecast, dtype=float).ravel()[0]

# Actual today close (from today's stored row)
actual_today_close = today_data['Close'].item()

# Accuracy = 100% minus the absolute percentage error
sarima_accuracy = (1 - abs(actual_today_close - predicted_today_close) / actual_today_close) * 100

print("Today's Close Prediction Results with SARIMA")
print(f"Predicted Close Value (SARIMA): {predicted_today_close:.2f}")
print(f"Actual Close Value: {actual_today_close:.2f}")
print(f"Prediction Accuracy: {sarima_accuracy:.2f}%")


In [None]:
# Step 0: Sanitize y_test and y_pred
def generate_signals(y_true, y_pred, threshold=0.01):
    """Turn paired actual/predicted values into 'Buy' / 'Sell' / 'Hold' labels.

    Args:
        y_true: iterable of actual values.
        y_pred: iterable of predicted values, paired with y_true by position.
        threshold: relative difference (as a fraction of the actual value)
            that must be exceeded before a 'Buy' or 'Sell' is issued.

    Returns:
        A list with one label per (actual, predicted) pair: 'Buy' when the
        prediction is more than `threshold` above the actual value, 'Sell'
        when it is more than `threshold` below, otherwise 'Hold'.
    """
    def classify(actual, pred):
        change = (pred - actual) / actual
        if change > threshold:
            return 'Buy'
        if change < -threshold:
            return 'Sell'
        return 'Hold'

    return [classify(actual, pred) for actual, pred in zip(y_true, y_pred)]

# Step 2: Make predictions and express them in price units
#
# y_test and the model output are in STANDARDISED close units (the 'Close'
# column was scaled in place before the split). Percentage-change signals and
# a rupee backtest only make sense on prices, so both are mapped back using a
# scaler re-fit on the raw close column of df_raw (same rows, hence the same
# mean and scale as the original pass).
close_scaler = StandardScaler().fit(df_raw[['Close']].to_numpy(dtype=float))

y_true = close_scaler.inverse_transform(
    np.asarray(y_test, dtype=float).reshape(-1, 1)).ravel()
y_pred = close_scaler.inverse_transform(
    np.asarray(best_model.predict(X_test), dtype=float).reshape(-1, 1)).ravel()

# Match lengths if needed
min_length = min(len(y_true), len(y_pred))
y_true = y_true[:min_length]
y_pred = y_pred[:min_length]

# Step 3: Generate signals
signals = generate_signals(y_true, y_pred)

# Step 4: Build the DataFrame, indexed like the test rows
df_signals = pd.DataFrame({
    'Actual_Close': y_true,
    'Predicted_Close': y_pred,
    'Signal': signals
})
df_signals.index = y_test.index[:min_length]  # Safe indexing
def backtest_strategy(df, initial_cash=1000):
    """Simulate an all-in / all-out strategy driven by the 'Signal' column.

    Args:
        df: frame with a 'Signal' column ('Buy', 'Sell' or anything else for
            no action) and an 'Actual_Close' price column.
        initial_cash: starting cash balance.

    Returns:
        A list with the portfolio value (cash plus shares valued at the row's
        price) after each row has been processed.
    """
    cash, shares = initial_cash, 0
    portfolio_values = []

    for signal, price in zip(df['Signal'], df['Actual_Close']):
        if signal == 'Buy' and cash > 0:
            # Put all available cash into shares at this row's price.
            shares, cash = cash / price, 0
        elif signal == 'Sell' and shares > 0:
            # Liquidate the whole position at this row's price.
            cash, shares = shares * price, 0

        portfolio_values.append(cash + shares * price)

    return portfolio_values

# Step 5: Run Backtest
# The starting capital is defined once, so the return calculation below always
# matches the amount the backtest actually started with.
initial_cash = 1000
portfolio_values = backtest_strategy(df_signals, initial_cash=initial_cash)
df_signals['Portfolio_Value'] = portfolio_values

# Step 6: Show Final Results
final_value = portfolio_values[-1]
total_return = (final_value - initial_cash) / initial_cash * 100
print(f"\nFinal Portfolio Value: ₹{final_value:.2f}")
print(f" Total Strategy Return: {total_return:.2f}%")


In [None]:
import matplotlib.pyplot as plt

# df_signals is indexed by row number, not by date. Look the dates up in the
# df_raw snapshot (which shares those row labels) so the x-axis really shows
# dates, as its label says.
signal_dates = df_raw['Date'].loc[df_signals.index]

plt.figure(figsize=(14,6))
plt.plot(signal_dates, df_signals['Portfolio_Value'], label='Portfolio Value', color='blue')
plt.title('Portfolio Growth Over Time')
plt.xlabel('Date')
plt.ylabel('Portfolio Value (₹)')
plt.legend()
plt.grid(True)
plt.show()

# Step 8: Plot Signal Distribution
import seaborn as sns

sns.countplot(x='Signal', data=df_signals)
plt.title('Distribution of Buy/Sell/Hold Signals')
plt.grid(True)
plt.show()

In [None]:

import pandas as pd
import matplotlib.pyplot as plt

# Number of future steps to forecast. Defined once so the forecast, the date
# range and every label stay in agreement.
FORECAST_STEPS = 365

# Step 1: Last date present in the downloaded data (df_copy keeps 'Date')
last_date = pd.to_datetime(df_copy['Date'].iloc[-1])

# Step 2: Forecast FORECAST_STEPS steps ahead with the SARIMA model that was
# fitted on the full series ('full_sarima_result' must already exist)
future_forecast = full_sarima_result.forecast(steps=FORECAST_STEPS)

# Step 3: One future business day (freq='B') per forecast step, starting the
# day after the last known date
future_dates = pd.date_range(last_date + pd.Timedelta(days=1),
                             periods=FORECAST_STEPS, freq='B')

# Step 4: Organize the forecast into a date-indexed DataFrame
future_forecast_df = pd.DataFrame({
    'Date': future_dates,
    'Predicted_Close': future_forecast.values
})
future_forecast_df.set_index('Date', inplace=True)

# Step 5: Plot historical and forecast prices together
plt.figure(figsize=(14,6))

# Historical close prices
plt.plot(df_copy['Date'], df_copy['Close'], label='Historical Close Prices')

# Forecast prices
plt.plot(future_forecast_df.index, future_forecast_df['Predicted_Close'],
         label=f'{FORECAST_STEPS}-Day Forecast', color='green')

plt.title(f'Future {FORECAST_STEPS} Days Stock Price Forecast')
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.legend()
plt.grid(True)
plt.show()

# Step 6: Show the start and end of the forecast table
print(f"\n{FORECAST_STEPS}-Day Future Stock Price Forecast:")
print(future_forecast_df.head())
print("\n",future_forecast_df.tail())