In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw dataset from the working directory and preview the first rows
df = pd.read_csv('financial_regression.csv')
df.head()

Unnamed: 0,date,sp500 open,sp500 high,sp500 low,sp500 close,sp500 volume,sp500 high-low,nasdaq open,nasdaq high,nasdaq low,...,palladium high,palladium low,palladium close,palladium volume,palladium high-low,gold open,gold high,gold low,gold close,gold volume
0,2010-01-14,114.49,115.14,114.42,114.93,115646960.0,0.72,46.26,46.52,46.22,...,45.02,43.86,44.84,364528.0,1.16,111.51,112.37,110.79,112.03,18305238.0
1,2010-01-15,114.73,114.84,113.2,113.64,212252769.0,1.64,46.46,46.55,45.65,...,45.76,44.4,45.76,442210.0,1.36,111.35,112.01,110.38,110.86,18000724.0
2,2010-01-18,,,,,,,,,,...,,,,,,,,,,
3,2010-01-19,113.62,115.13,113.59,115.06,138671890.0,1.54,45.96,46.64,45.95,...,47.08,45.7,46.94,629150.0,1.38,110.95,111.75,110.83,111.52,10467927.0
4,2010-01-20,114.28,114.45,112.98,113.89,216330645.0,1.47,46.27,46.604,45.43,...,47.31,45.17,47.05,643198.0,2.14,109.97,110.05,108.46,108.94,17534231.0


In [3]:
# List every column name in the loaded frame
df.columns

Index(['date', 'sp500 open', 'sp500 high', 'sp500 low', 'sp500 close',
       'sp500 volume', 'sp500 high-low', 'nasdaq open', 'nasdaq high',
       'nasdaq low', 'nasdaq close', 'nasdaq volume', 'nasdaq high-low',
       'us_rates_%', 'CPI', 'usd_chf', 'eur_usd', 'GDP', 'silver open',
       'silver high', 'silver low', 'silver close', 'silver volume',
       'silver high-low', 'oil open', 'oil high', 'oil low', 'oil close',
       'oil volume', 'oil high-low', 'platinum open', 'platinum high',
       'platinum low', 'platinum close', 'platinum volume',
       'platinum high-low', 'palladium open', 'palladium high',
       'palladium low', 'palladium close', 'palladium volume',
       'palladium high-low', 'gold open', 'gold high', 'gold low',
       'gold close', 'gold volume'],
      dtype='object')

In [4]:
# Keep only the FX rates and the gold columns: remove the date, the six
# per-asset price/volume fields of every other instrument, and the macro series.
other_assets = ['sp500', 'nasdaq', 'silver', 'oil', 'platinum', 'palladium']
asset_fields = ['open', 'high', 'low', 'close', 'volume', 'high-low']
macro_cols = ['us_rates_%', 'CPI', 'GDP']

unused_cols = ['date']
for asset in other_assets:
    for field in asset_fields:
        unused_cols.append(f'{asset} {field}')
unused_cols.extend(macro_cols)

df = df.drop(columns=unused_cols)

In [5]:
# Preview the frame after the column drop
df.head()

Unnamed: 0,usd_chf,eur_usd,gold open,gold high,gold low,gold close,gold volume
0,1.0206,1.4478,111.51,112.37,110.79,112.03,18305238.0
1,1.0264,1.4376,111.35,112.01,110.38,110.86,18000724.0
2,,,,,,,
3,1.034,1.4269,110.95,111.75,110.83,111.52,10467927.0
4,1.0453,1.4094,109.97,110.05,108.46,108.94,17534231.0


In [6]:
# Remove rows where the target column 'gold close' is missing, then preview
df = df.dropna(subset=['gold close'])
df.head()

Unnamed: 0,usd_chf,eur_usd,gold open,gold high,gold low,gold close,gold volume
0,1.0206,1.4478,111.51,112.37,110.79,112.03,18305238.0
1,1.0264,1.4376,111.35,112.01,110.38,110.86,18000724.0
3,1.034,1.4269,110.95,111.75,110.83,111.52,10467927.0
4,1.0453,1.4094,109.97,110.05,108.46,108.94,17534231.0
5,1.0426,1.4106,108.48,108.78,106.61,107.37,25747831.0


In [7]:
# Fill gaps in the two exchange-rate columns with each column's own median.
# The median is taken over the whole frame, i.e. before the train/test split.
for fx_col in ("usd_chf", "eur_usd"):
    fx_median = df[fx_col].median()
    df[fx_col] = df[fx_col].fillna(fx_median)

In [8]:
# Count the missing values left in each column after the median fill
print(df.isnull().sum())

usd_chf        0
eur_usd        0
gold open      0
gold high      0
gold low       0
gold close     0
gold volume    0
dtype: int64


In [9]:
# Fill any remaining numeric gaps with per-column medians, then summarise
# the frame's dtypes and non-null counts.
column_medians = df.median(numeric_only=True)
df = df.fillna(column_medians)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3719 entries, 0 to 3903
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   usd_chf      3719 non-null   float64
 1   eur_usd      3719 non-null   float64
 2   gold open    3719 non-null   float64
 3   gold high    3719 non-null   float64
 4   gold low     3719 non-null   float64
 5   gold close   3719 non-null   float64
 6   gold volume  3719 non-null   float64
dtypes: float64(7)
memory usage: 232.4 KB


In [10]:
# Number of rows that exactly repeat an earlier row
df.duplicated().sum()

0

In [11]:
# Feature matrix: every column except the target
X = df.drop(columns=['gold close'])

In [12]:
# Target vector: the gold closing price
y = df['gold close']

In [13]:
# Hand-written list of the numeric columns in df.
# NOTE(review): this list contains the target 'gold close', which is not a
# column of X. It is replaced by X's own column list before any transformer
# uses it, so this assignment only feeds the length check that follows.
num_cols = ['usd_chf','eur_usd', 'gold open', 'gold high', 'gold low', 'gold close',
       'gold volume']

In [14]:
# How many column names are in the list
len(num_cols)

7

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [16]:
# Feature column names, taken directly from X
num_cols = list(X.columns)

In [17]:
# Numeric preprocessing pipeline with a single standardisation step
scaling_steps = [('scaler', StandardScaler())]
num_pipeline = Pipeline(scaling_steps)

In [18]:
# Column transformer that routes the listed feature columns through num_pipeline
numeric_branch = ('num', num_pipeline, num_cols)
preprocessor = ColumnTransformer([numeric_branch])

In [19]:
# Display the numeric pipeline
num_pipeline

In [20]:
# Display the column transformer
preprocessor

In [21]:
# Build (not yet fit) a column transformer over the feature columns;
# it is configured the same way as `preprocessor`.
ct = ColumnTransformer([('num', num_pipeline, num_cols)])

In [22]:
# Display the column transformer
ct

In [23]:
from sklearn.neighbors import KNeighborsRegressor

In [24]:
# Full model: the scaling pipeline followed by a k-nearest-neighbours regressor.
# NOTE(review): the step labelled "ct" holds `num_pipeline` (the bare scaler
# pipeline), not the ColumnTransformer `ct` built earlier — confirm which one
# was intended before renaming or swapping, since the step name is part of
# the pickled object.
final_pl = Pipeline(steps = [("ct",num_pipeline),
                        ("estimator",KNeighborsRegressor())])

In [25]:
# Display the assembled pipeline
final_pl

In [26]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): train_test_split shuffles rows by default, and a 'date' column
# was dropped earlier, so the rows look like dated observations — confirm that
# mixing time periods between train and test is intended.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.25,random_state = 45)

In [27]:
# Fit the scaler and the KNN regressor on the training split
final_pl.fit(X_train,y_train)

In [28]:
# Predict the target for the held-out rows
y_pred = final_pl.predict(X_test)

In [29]:
# Display the predicted values
y_pred

array([126.19   , 161.034  , 118.53   , 181.37   , 127.08198, 180.012  ,
       103.906  , 167.902  , 117.63   , 119.094  , 166.7    , 231.648  ,
       116.248  , 172.884  , 158.396  , 121.828  , 120.848  , 119.186  ,
       174.27   , 115.87   , 180.814  , 122.948  , 121.298  , 107.6796 ,
       160.73208, 123.648  , 160.38   , 182.192  , 147.872  , 114.61   ,
       164.292  , 170.682  , 160.568  , 170.246  , 119.03   , 103.896  ,
       123.45   , 154.62788, 117.812  , 164.678  , 139.984  , 116.374  ,
       124.97798, 155.032  , 135.316  , 125.3642 , 121.96   , 121.854  ,
       114.976  , 165.77316, 168.442  , 178.128  , 176.078  , 161.126  ,
       108.542  , 108.656  , 157.918  , 124.86   , 125.776  , 126.116  ,
       160.35   , 178.212  , 147.806  , 132.7065 , 231.648  , 144.314  ,
       111.368  , 133.58   , 167.72   , 215.23   , 214.254  , 183.6    ,
       221.296  , 167.318  , 166.572  , 113.418  , 127.384  , 111.36   ,
       177.274  , 163.35   , 166.008  , 115.272  , 

In [30]:
# R² of the pipeline's predictions on the held-out split
r2_score(y_test,y_pred)

0.994976287776258

In [31]:
# Serialise the pipeline object to goldPrediction.pkl in the working directory
import pickle

with open("goldPrediction.pkl","wb") as f:
    pickle.dump(final_pl,f)

In [32]:
# Read the pipeline back from disk into `model`.
# pickle.load can execute arbitrary code contained in the file, so only load
# pickles from a trusted source; this file was written by this notebook.
with open("goldPrediction.pkl","rb") as f:
    model = pickle.load(f)

In [33]:
# Preview the feature matrix (the column order the pipeline was trained on)
X.head()

Unnamed: 0,usd_chf,eur_usd,gold open,gold high,gold low,gold volume
0,1.0206,1.4478,111.51,112.37,110.79,18305238.0
1,1.0264,1.4376,111.35,112.01,110.38,18000724.0
3,1.034,1.4269,110.95,111.75,110.83,10467927.0
4,1.0453,1.4094,109.97,110.05,108.46,17534231.0
5,1.0426,1.4106,108.48,108.78,106.61,25747831.0


In [41]:
# Model comparison: fit several regressors on one imputed + scaled copy of the
# train/test split and tabulate test error plus train/test R².
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

# 1. Median-impute; the medians are learned from the training split only
imputer = SimpleImputer(strategy="median")
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# 2. Standardise; the scaler is likewise fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# 3. Candidate models (seeded where the estimator accepts a random_state)
models = {
    "Linear Regression": LinearRegression(),
    "KNeighbors Regressor": KNeighborsRegressor(n_neighbors=5),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42, n_estimators=100),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR()
}

# 4. Fit and score every candidate.
# The loop variable is `regressor`, not `model`: `model` already holds the
# pipeline re-loaded from goldPrediction.pkl and must not be overwritten by
# whichever estimator happens to be last in the dict.
results = []
for name, regressor in models.items():
    regressor.fit(X_train_scaled, y_train)
    y_train_pred = regressor.predict(X_train_scaled)
    y_test_pred = regressor.predict(X_test_scaled)

    mae = mean_absolute_error(y_test, y_test_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    r2_test = r2_score(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred)  # train R² exposes overfitting

    results.append({
        "Model": name,
        "MAE": round(mae, 3),
        "RMSE": round(rmse, 3),
        "R2 Score (Test)": round(r2_test, 3),
        "R2 Score (Train)": round(r2_train, 3)
    })

# 5. One row per model, best test R² first
performance_table = pd.DataFrame(results).sort_values(by="R2 Score (Test)", ascending=False).reset_index(drop=True)
print(performance_table)


                      Model    MAE   RMSE  R2 Score (Test)  R2 Score (Train)
0             Random Forest  1.374  2.318            0.994             0.999
1             Decision Tree  1.770  2.985            0.991             1.000
2         Gradient Boosting  2.478  3.529            0.987             0.993
3      KNeighbors Regressor  2.587  4.291            0.980             0.988
4  Support Vector Regressor  5.197  8.494            0.923             0.935
5         Linear Regression  6.772  8.985            0.914             0.921


In [43]:
# Final performance report.
# The model-comparison cell above has already fitted and scored these same six
# models on the same imputed/scaled split and stored the sorted metrics in
# `performance_table`. Re-running the whole fit here duplicated that work and
# rebound `model` (the pipeline loaded from goldPrediction.pkl) to the last
# estimator in the loop, so the report is derived from the existing table and
# only the two R² column labels change.
performance_report = performance_table.rename(columns={
    "R2 Score (Test)": "R² (Test)",
    "R2 Score (Train)": "R² (Train)",
})

print("\n📊 Regression Performance Report:")
print(performance_report)



📊 Regression Performance Report:
                      Model    MAE   RMSE  R² (Test)  R² (Train)
0             Random Forest  1.374  2.318      0.994       0.999
1             Decision Tree  1.770  2.985      0.991       1.000
2         Gradient Boosting  2.478  3.529      0.987       0.993
3      KNeighbors Regressor  2.587  4.291      0.980       0.988
4  Support Vector Regressor  5.197  8.494      0.923       0.935
5         Linear Regression  6.772  8.985      0.914       0.921
