<a href="https://colab.research.google.com/github/Rinch1/stock-predictions/blob/main/Module_1_Project_Data_Science.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The code below mounts your Google Drive so that Google Colab can access it.

In [None]:
# Mount Google Drive at /content/drive inside this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#!ls "/content/drive/My Drive/Colab Notebooks"

In [None]:
#path = "/content/drive/My Drive/Colab Notebooks/RELIANCE.csv"

**IMPORT NECESSARY LIBRARIES**

Read your data and do preprocessing

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yfinance as yf
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from statsmodels.tsa.stattools import adfuller
# Download RELIANCE.NS price history for the given date range via yfinance.
reliance = yf.download(
    "RELIANCE.NS",
    start="2014-01-01",
    end="2024-12-31",
)
# `df` is a second name for the same object as `reliance`, not a copy.
df = reliance
df.head()

In [None]:
# Show the last rows of the downloaded frame.
df.tail()

In [None]:
# (rows, columns) of the downloaded frame.
df.shape

Checking for null values

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Summary statistics for each column.
df.describe()

In [None]:
# Print the frame's index, column dtypes and non-null counts.
df.info()

In [None]:
# Display the frame (the notebook truncates long frames when rendering).
df

Make the Date column the index column

In [None]:
# Changes The Date column as index columns
#df.index = pd.to_datetime(df['Date'])
#df

# drop The original date column
#df = df.drop(['Date'], axis='columns')
#df


In [None]:
# Independent deep copy of `df`, used below for the skewness experiments.
df_copy=df.copy(deep=True)

In [None]:
# Skewness of each column of the copy.
df_copy.skew()

In [None]:
sns.set_style("whitegrid")

numerical_columns = df.select_dtypes(include=["int64", "float64"]).columns

# Lay the histograms out two per row. The grid needs only ceil(n / 2) rows;
# allocating one row per column (as before) left the lower half of the figure
# blank and made it twice as tall as necessary.
n_grid_cols = 2
n_grid_rows = max(1, (len(numerical_columns) + n_grid_cols - 1) // n_grid_cols)

plt.figure(figsize=(14, n_grid_rows * 3), facecolor="gold")
for idx, feature in enumerate(numerical_columns, 1):
    plt.subplot(n_grid_rows, n_grid_cols, idx)
    sns.histplot(df[feature], kde=True)
    plt.title(f"{feature} Distribution | skewness: {round(df[feature].skew(), 2)}")
    plt.xlabel(feature)
    plt.ylabel("Frequency (Density)")

plt.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

In [None]:
# Square-root transform of every column of the copy, then the skewness of each result.
sqrt_df=np.sqrt(df_copy)
sqrt_df.skew()

In [None]:
# Same transform restricted to Volume; note this rebinds `sqrt_df` to the Volume-only result.
sqrt_df=np.sqrt(df_copy["Volume"])
sqrt_df.skew()

In [None]:
# Cube-root transform of Volume and the skewness of the result.
cbrt=np.cbrt(df_copy["Volume"])
cbrt.skew()

**TIMESERIES CHART FOR OPEN PRICE**

In [None]:
# Red line chart of the Open column; the top and right spines are hidden.
open_ax = df['Open'].plot(kind='line', figsize=(8, 4), title='Open', color="red")
open_ax.spines[['top', 'right']].set_visible(False)


In [None]:
# Line chart of the Close column; the top and right spines are hidden.
close_ax = df['Close'].plot(kind='line', figsize=(8, 4), title='Close')
close_ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Pairwise correlation of the columns, drawn as an annotated heatmap.
corr = df.corr()
heatmap_ax = sns.heatmap(corr, annot=True, cmap="coolwarm")
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# List the column labels of the frame.
df.columns

**Feature Engineering**:

* Lagged values (Close_1, Close_2) and moving averages (MA_5, MA_20) are added as features. This is crucial for time series prediction.

* A "lag" represents a time delay.

 So, a "lagged value" is a value from an earlier point in time.

* For example, if you have a time series of daily stock prices:
    * A "lag-1" value for today's price would be yesterday's price.
    * A "lag-2" value would be the price from two days ago, and so on.

In [None]:
# Build the feature frame from the untouched download (`reliance`) instead of
# mutating and rebinding `df`. The previous form added columns to `df` and then
# replaced it with `df.dropna()`, so every re-run of this cell dropped another
# 19 leading rows. `assign` returns a new frame, which makes the cell idempotent.
close = reliance['Close']
df = reliance.assign(
    Close_1=close.shift(1),                  # previous row's close
    Close_2=close.shift(2),                  # close from two rows back
    MA_5=close.rolling(window=5).mean(),     # 5-row moving average
    MA_20=close.rolling(window=20).mean(),   # 20-row moving average
).dropna()  # drop the leading rows where lags / rolling means are undefined

In [None]:
# Display the frame after the lag and moving-average columns were added.
df

In [None]:

# Plot the Close series as-is, before any differencing.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['Close'], label="Original data")
ax.set_title('Original Time Series (Non-Stationary)')
ax.legend()
plt.show()

# Function to perform ADF test

def adf_test(series, title='ADF Test'):
    """Run ``adfuller`` on ``series`` (NaNs dropped) and print a short report.

    Args:
        series: Object exposing ``dropna()``; the NaN-free values are tested.
        title: Heading printed on the first line of the report.

    Returns:
        None. The report (heading, statistic, p-value and one line per
        critical value) is written to stdout.
    """
    report = adfuller(series.dropna(), autolag='AIC')
    statistic, p_value, critical_values = report[0], report[1], report[4]

    lines = [
        f'{title}:',
        f'ADF Statistic: {statistic}',
        f'p-value: {p_value}',
    ]
    lines.extend(
        f'Critical Value ({level}): {threshold:.3f}'
        for level, threshold in critical_values.items()
    )
    print('\n'.join(lines))

adf_test(df['Close'], title='ADF Test on Original Data')

# First-order differencing of every column; the first row (all NaN) is dropped.
data_diff = df.diff().dropna()

# Plot the differenced Close series.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(data_diff['Close'], label='First-Order Differenced Data')
ax.set_title('First-Order Differenced Time Series (Potentially Stationary)')
ax.legend()
plt.show()

In [None]:
# ADF test on the differenced Close; a p-value below 0.05 is reported as stationary.
results = adfuller(data_diff['Close'])
p_value = results[1]
print("p-value:", p_value)
print("Data is stationary" if p_value < 0.05 else "Data is not stationary")

In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt

# Sample non-stationary time series data (replace with your data)
#df = pd.Series([i**2 for i in range(100)])

# Plot every column of the undifferenced frame on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df, label="Original data")
ax.set_title('Original Time Series (Non-Stationary)')
ax.legend()
plt.show()

# Function to perform ADF test

def adf_test(series, title='ADF Test'):
    """Run ``adfuller`` on ``series`` (NaNs dropped) and print its p-value.

    Args:
        series: Object exposing ``dropna()``; the NaN-free values are tested.
        title: Heading printed on the line above the p-value.

    Returns:
        None. Two lines (heading, p-value) are written to stdout.
    """
    outcome = adfuller(series.dropna(), autolag='AIC')
    p_value = outcome[1]
    for line in (f'{title}:', f'p-value: {p_value}'):
        print(line)

# ADF report plus a stationarity verdict for every numeric column of `df`.
# `adf_test` prints the report; `adfuller` is also called directly so the
# p-value is available here for the 0.05 threshold check.
numerical_cols = df.select_dtypes(include=['number']).columns
for col in numerical_cols:
    results = adfuller(df[col])
    adf_test(df[col], title=f'ADF Test on {col}')
    verdict = "Data is stationary" if results[1] < 0.05 else "Data is not stationary"
    print(verdict)

# First-order differencing of every column; the first row (all NaN) is dropped.
data_diff = df.diff().dropna()

# Plot all differenced columns on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(data_diff, label='First-Order Differenced Data')
ax.set_title('First-Order Differenced Time Series (Potentially Stationary)')
ax.legend()
plt.show()

In [None]:
# ADF report plus a stationarity verdict for every differenced column.
# The cell used to start by printing `results[1]`, which at that point still
# held the p-value of the last *undifferenced* column from the previous cell's
# loop; that stale line is removed so all output here refers to `data_diff`.
numerical_cols = df.select_dtypes(include=['number']).columns
for col in numerical_cols:
    results = adfuller(data_diff[col])
    adf_test(data_diff[col], title=f'ADF Test on {col}')
    if results[1] < 0.05:
        print("Data is stationary")
    else:
        print("Data is not stationary")

**DEFINING TARGET VARIABLE**

In [None]:
#y = df['Close']
#X = df.drop('Close', axis=1)

In [None]:
# Features: every differenced column except Close. Target: the differenced Close.
X = data_diff.drop(columns='Close')
y = data_diff['Close']

Scaling using MinMaxScaler

In [None]:
# Fit a MinMaxScaler on every row of X and scale those rows in the same step.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

**Time Series Split**:

* The data is split into training and testing sets while preserving the time series order. This is essential to prevent lookahead bias.

In [None]:
# Split chronologically first (shuffle=False keeps the row order), then fit the
# scaler on the training window only. Scaling all rows before splitting lets the
# test period's min/max shape the training features, which is a form of
# look-ahead. X_train / X_test are NumPy arrays, as they were before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
scaler = MinMaxScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

**MODEL TRAINING**

* The code now trains and evaluates linear regression models

In [None]:
# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

**MODEL EVALUATION**


In [None]:
# Predict on the held-out split and compute MSE, its square root (RMSE) and R^2.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

In [None]:
# All three metrics use fixed two-decimal formatting. The bare `.2` spec used
# for mse before means "2 significant digits" and can switch to exponent form.
print(f"Linear Regression -\n RMSE: {rmse:.2f},\n R2: {r2:.2f},\n mse:{mse:.2f}")

Linear regression using statsmodels

In [None]:
import statsmodels.api as sm

# add_constant adds a constant column so OLS can estimate an intercept.
X_train_sm = sm.add_constant(X_train)
# NOTE(review): X_test_sm is not used by any cell visible in this notebook.
X_test_sm = sm.add_constant(X_test)

# Fit OLS on the training split.
model_sm = sm.OLS(y_train, X_train_sm).fit()

# Print the fitted model's summary table.
print(model_sm.summary())

In [None]:
# y_test and y_pred come from the first-differenced frame (`data_diff`), so the
# plotted values are row-to-row changes in Close, not price levels. The labels
# previously said "Prices", which misdescribed the chart.
plt.figure(figsize=(10, 6))
plt.plot(y_test.index, y_test, label="Actual Change in Close", color="red")
plt.plot(y_test.index, y_pred, label="Predicted Change in Close", color="green")
plt.legend()
plt.title("Reliance Close Price Change: Actual vs Predicted (Linear Regression)")
plt.xlabel("Date")
plt.ylabel("Change in Close")
plt.show()

In [None]:
import yfinance as yf
import pandas as pd

# Get latest data. The window starts well before the prediction date so the
# 20-row moving average has enough history to be defined there.
new_data = yf.download("RELIANCE.NS", start="2025-01-01", end="2025-04-03")

# Build the same features the model was trained on:
#   * shift(1) / shift(2) look back one and two rows. The negative shifts used
#     before pulled in the *following* rows' closes, i.e. future values.
#   * MA_20 is a 20-row mean, as in training (it was computed with window=2).
new_data['Close_1'] = new_data['Close'].shift(1)  # Lag 1
new_data['Close_2'] = new_data['Close'].shift(2)  # Lag 2
new_data['MA_5'] = new_data['Close'].rolling(window=5).mean()
new_data['MA_20'] = new_data['Close'].rolling(window=20).mean()
new_data.head()

In [None]:
# Last two rows of the training-period frame.
df.tail(2)

In [None]:
# Print the feature column labels (wrapped in a one-element list).
print([X.columns])

In [None]:
# Select the row(s) of the recent window whose index equals the prediction date.
prediction_date = '2025-04-02'
on_prediction_date = new_data.index == prediction_date
data_to_predict = new_data.loc[on_prediction_date]

In [None]:
# Rebind X / y to the recent window (the same names this cell has always set).
y = new_data['Close']
X = new_data.drop('Close', axis=1)
features_for_prediction = X.columns

# The model was fitted on first-differenced features (`data_diff`) that had been
# scaled with the scaler fitted on the training data, so inference inputs must
# go through the same two steps. This cell used to create and fit a brand-new
# MinMaxScaler on the recent window and feed undifferenced price levels, which
# puts the inputs on a scale the fitted coefficients were never trained on.
# `scaler` is therefore left untouched here and only used to transform.
new_data_diff = new_data.diff()
new_data_scaled = scaler.transform(new_data_diff[features_for_prediction])

# Feature row(s) for the prediction date, in the training column order.
data_to_predict_features = new_data_diff.loc[
    new_data_diff.index == prediction_date, features_for_prediction
]
if data_to_predict_features.empty:
    raise ValueError(
        f"No row for {prediction_date} in new_data; choose a date inside the downloaded range."
    )

In [None]:
# Scale the selected feature row(s) and run the model on them.
# The triple-quoted block of abandoned inverse-transform code that used to end
# this cell was a bare string expression, so the notebook echoed it as output.
data_to_predict_scaled = scaler.transform(data_to_predict_features)
predictions = model.predict(data_to_predict_scaled)

In [None]:
# Report the first (only expected) prediction for the chosen date.
predicted_value = predictions[0]
print(f"Predicted value for {prediction_date}: {predicted_value}")

NameError: name 'predictions' is not defined

In [None]:
!pip install gradio

In [None]:
# Install required libraries
!pip install gradio joblib numpy pandas scikit-learn

# Import necessary libraries
import gradio as gr
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Upload the dataset
import io

from google.colab import files
uploaded = files.upload()  # prompts for a file; result is keyed by uploaded file name
if not uploaded:
    raise ValueError("No file was uploaded; upload the Reliance CSV to continue.")

# Load the dataset from whatever was uploaded. The hard-coded 'reliance.csv'
# raised FileNotFoundError whenever the uploaded file had a different name.
csv_name = next(iter(uploaded))
data = pd.read_csv(io.BytesIO(uploaded[csv_name]))

# Preprocess the data
# Assuming 'Close' is the target variable and the remaining numeric columns are
# features. Non-numeric columns (for example a date string) are left out because
# LinearRegression cannot fit on them.
X = data.drop(['Close'], axis=1).select_dtypes(include='number')  # Features
y = data['Close']                                                 # Target variable

# Split chronologically: shuffling a time series puts future rows in the training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, 'reliance_model.joblib')

# Load the trained model
model = joblib.load('reliance_model.joblib')

# Define a function to make predictions
def predict(*features):
    """Predict one value from a single row of feature values.

    Gradio calls the function with one positional argument per input
    component, so the values are collected with ``*features``. A single
    sequence (``predict([f1, f2, ...])``) is still accepted, because the
    values are reshaped into one row either way.

    Args:
        *features: The feature values, either as separate arguments or as one
            sequence. Their order must match the order the model was fitted with.

    Returns:
        float: The first element of the model's prediction for that row.
    """
    # One row, as many columns as there are values.
    features_array = np.array(features, dtype=float).reshape(1, -1)

    prediction = model.predict(features_array)

    # Return a plain Python float rather than a NumPy scalar.
    return float(np.ravel(prediction)[0])

# Define the Gradio interface
def create_interface():
    """Build the Gradio interface with one numeric input per model feature.

    Input labels come from the fitted model's ``feature_names_in_`` when it has
    that attribute, so the number and order of inputs match what the model
    expects. Otherwise the original four placeholder names are used.

    Returns:
        gr.Interface: An interface that passes the inputs to ``predict`` and
        shows a numeric output.
    """
    placeholder_names = ['Feature1', 'Feature2', 'Feature3', 'Feature4']
    feature_names = [
        str(name) for name in getattr(model, 'feature_names_in_', placeholder_names)
    ]

    # gr.Number is the component class; the older gr.inputs namespace is not
    # available in current Gradio releases.
    inputs = [gr.Number(label=name) for name in feature_names]

    interface = gr.Interface(
        fn=predict,
        inputs=inputs,
        outputs="number",
        title="Reliance Stock Price Prediction",
        description="Enter the feature values to predict the stock price."
    )

    return interface

# Launch the Gradio app
# Build the interface and start the app.
interface = create_interface()
interface.launch(share=True)  # Use share=True to get a public link




FileNotFoundError: [Errno 2] No such file or directory: 'reliance.csv'