In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import pickle



In [2]:

# Step 1: Load and Preprocess Datasets

# Load stock_price_dataset.csv
stock_price_df = pd.read_csv('stock_price_dataset.csv')

# Load upload_DJIA_table.csv
djia_df = pd.read_csv('upload_DJIA_table.csv')

# Rename columns to ensure consistency (if needed)
stock_price_df.rename(columns={'Adj Close': 'Adj_Close'}, inplace=True)
djia_df.rename(columns={'Adj Close': 'Adj_Close'}, inplace=True)



In [4]:
# Specify the correct datetime format explicitly
stock_price_df['Date'] = pd.to_datetime(stock_price_df['Date'], format='%y-%m-%d')
djia_df['Date'] = pd.to_datetime(djia_df['Date'], format='%Y-%m-%d')


In [5]:
# Step 3: Merge datasets (if necessary)

# Merge stock_price_df with djia_df on 'Date' column (assuming they contain related information)
merged_df = pd.merge(stock_price_df, djia_df, on='Date', how='outer', suffixes=('_stock', '_djia'))
# Display information about merged DataFrame


In [6]:
# Extract date-related features
merged_df['Day'] = merged_df['Date'].dt.day
merged_df['Month'] = merged_df['Date'].dt.month
merged_df['Year'] = merged_df['Date'].dt.year

In [7]:

# Step 4: Prepare Data for Model Training

# Select relevant features and target variable (Close price)
features = ['Open_stock', 'High_stock', 'Low_stock', 'Volume_stock','Close_stock','Year','Month','Day']
target = 'Close_stock'  # Target variable

X = merged_df[features]
y = merged_df[target]


In [8]:
# Filter DataFrame to exclude rows with NaN values in selected columns
merged_df_clean = merged_df.dropna(subset=features + [target])
print(X.shape)  # Should not be (0, num_features)
print(y.shape)  # Should not be (0,)



(6680, 8)
(6680,)


In [9]:

# Step 5: Split Data into Training and Test Sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)



In [10]:
from sklearn.impute import SimpleImputer

# Create an imputer instance
imputer = SimpleImputer(strategy='mean')  # You can also use 'median' or 'most_frequent'

# Fit the imputer on X_train to learn the imputation parameters
imputer.fit(X_train)

# Transform X_train and X_test using the learned imputation parameters
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)


In [11]:
# Drop rows with NaN values from X_train and align y_train accordingly
X_train_clean = X_train.dropna()
y_train_clean = y_train.loc[X_train_clean.index]  # Match y_train indices with cleaned X_train indices


In [14]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Assuming you have defined `X` and `y` with features and target
features = ['Open_stock', 'High_stock', 'Low_stock', 'Volume_stock','Year','Month','Day']
target = 'Close_stock'

# Combine features and target into a single DataFrame
data = pd.concat([X, y], axis=1)

# Drop rows with NaN values in the target column (Close_stock)
data_clean = data.dropna(subset=[target])

# Split data into features (X) and target (y) again after cleaning
X_clean = data_clean[features]
y_clean = data_clean[target]

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.3, random_state=42)

# Define preprocessing steps within a pipeline
preprocessor = ColumnTransformer([
    ('imputer', SimpleImputer(strategy='mean'), features)  # Impute missing values in specified features
])

# Define the full pipeline including preprocessing and model
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fit the pipeline on X_train and y_train
model.fit(X_train, y_train)

# Make predictions on the filtered test set
y_pred = model.predict(X_test)

# Print actual and predicted values for comparison
print("Actual:")
print(y_test[:5])  # Print first 5 actual values
print("\nPredicted:")
print(y_pred[:5])  # Print first 5 predicted values



pickle_out = open('predict_talha.pkl','wb')
pickle.dump(model,pickle_out)
pickle_out.close()


# Calculate R-squared scores
test_r2 = r2_score(y_test, y_pred)

print(f"Test R-squared: {test_r2:.4f}")



Actual:
      Close_stock  Close_stock
1501       103.28       103.28
2586        78.83        78.83
2653       136.87       136.87
1055        53.80        53.80
705         56.55        56.55

Predicted:
[[106.25606552 106.25606552]
 [ 78.79810927  78.79810927]
 [137.44301063 137.44301063]
 [ 51.98596264  51.98596264]
 [ 57.57880823  57.57880823]]
Test R-squared: 0.9988


In [15]:

# Calculate R-squared scores
test_r2 = r2_score(y_test, y_pred)

print(f"Test R-squared: {test_r2:.4f}")


Test R-squared: 0.9988
