In [1]:
import pandas as pd
import numpy as np
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print('Train Dataset Head:')
print(train_df.head())
print('Test Dataset Head:')
print(test_df.head())

In [2]:
train_df.dtypes

ID                     int64
Company               object
Quarter               object
QuickRatio           float64
InventoryRatio       float64
RevenueGrowth        float64
MarketshareChange    float64
Bond rating           object
Stock rating          object
Region                object
Industry              object
Sales                float64
dtype: object

In [3]:
test_df.dtypes

ID                     int64
Company               object
Quarter               object
QuickRatio           float64
InventoryRatio       float64
RevenueGrowth        float64
MarketshareChange    float64
Bond rating           object
Stock rating          object
Region                object
Industry              object
dtype: object

In [4]:
m_values_train = train_df.isnull().sum()
m_values_test = test_df.isnull().sum()

print('Missing Values in Train:')
print(m_values_train)
print('\
Missing Values in Test:')
print(m_values_test)

Missing Values in Train:
ID                     0
Company                0
Quarter                0
QuickRatio             0
InventoryRatio       152
RevenueGrowth          0
MarketshareChange      0
Bond rating            0
Stock rating           0
Region                 0
Industry               0
Sales                150
dtype: int64
Missing Values in Test:
ID                    0
Company               0
Quarter               0
QuickRatio            0
InventoryRatio       32
RevenueGrowth         0
MarketshareChange     0
Bond rating           0
Stock rating          0
Region                0
Industry              0
dtype: int64


In [5]:
from sklearn.impute import SimpleImputer

# Imputing missing values in 'InventoryRatio' with the median
imputer = SimpleImputer(strategy='median')
train_df['InventoryRatio'] = imputer.fit_transform(train_df[['InventoryRatio']])

# Dropping rows where 'Sales' is missing
train_df.dropna(subset=['Sales'], inplace=True)

# Checking again for missing values to ensure the cleaning was successful
df_missing_values_cleaned = train_df.isnull().sum()
print(df_missing_values_cleaned[df_missing_values_cleaned > 0])

Series([], dtype: int64)


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Clean the data
# Drop rows where Sales is missing in the training data
train_df.dropna(subset=['Sales'], inplace=True)

# Replace missing values in 'InventoryRatio' with the mean for both train and test data
train_df['InventoryRatio'].fillna(train_df['InventoryRatio'].mean(), inplace=True)
test_df['InventoryRatio'].fillna(train_df['InventoryRatio'].mean(), inplace=True)

# Select features
X_train = train_df.drop(['Sales'], axis=1).select_dtypes(exclude=['object'])
y_train = train_df['Sales']
X_test = test_df.select_dtypes(exclude=['object'])

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test data
predictions = model.predict(X_test)

# Since we don't have actual sales data for the test set, we'll evaluate the model using the training set
pred_train = model.predict(X_train)
mae = mean_absolute_error(y_train, pred_train)
print('MAE:', mae)
submission_df = test_df[['ID']].copy()
submission_df['Sales'] = predictions

# Save the submission dataframe to a CSV file
submission_df.to_csv('submission3.csv', index=False)

print('Submission file saved as submission.csv')

MAE: 1555.239284298314


In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Clean the data
train_df.dropna(subset=['Sales'], inplace=True)
train_df['InventoryRatio'].fillna(train_df['InventoryRatio'].mean(), inplace=True)
test_df['InventoryRatio'].fillna(train_df['InventoryRatio'].mean(), inplace=True)


# Select features
X_train = train_df.drop(['Sales'], axis=1).select_dtypes(exclude=['object'])
y_train = train_df['Sales']
X_test = test_df.select_dtypes(exclude=['object'])

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model (using Ridge Regression as an example)
model = Ridge(alpha=1.0)  # You can tune the alpha parameter
model.fit(X_train_scaled, y_train)

# Predict on training data for evaluation
pred_train = model.predict(X_train_scaled)
mae = mean_absolute_error(y_train, pred_train)
print('MAE on training data:', mae)

# Predict on test data
predictions = model.predict(X_test_scaled)


MAE on training data: 1555.240578147828


In [8]:

##2nd submission

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from scipy.stats.mstats import winsorize

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Clean the data
# Drop rows where Sales is missing in the training data
train_df.dropna(subset=['Sales'], inplace=True)

# Identify outliers in 'Sales' using z-score
z_scores = (train_df['Sales'] - train_df['Sales'].mean()) / train_df['Sales'].std()
outliers = train_df[abs(z_scores) > 3]  # Considering values with z-score > 3 as outliers

# Winsorize 'Sales' at the 95th percentile
train_df['Sales'] = winsorize(train_df['Sales'], limits=(0, 0.05))

# Replace missing values in 'InventoryRatio' with the mean for both train and test data
train_df['InventoryRatio'].fillna(train_df['InventoryRatio'].mean(), inplace=True)
test_df['InventoryRatio'].fillna(train_df['InventoryRatio'].mean(), inplace=True)

# Select features
X_train = train_df.drop(['Sales'], axis=1).select_dtypes(exclude=['object'])
y_train = train_df['Sales']
X_test = test_df.select_dtypes(exclude=['object'])

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on test data
predictions = model.predict(X_test)


pred_train = model.predict(X_train)
mae = mean_absolute_error(y_train, pred_train)
print('MAE:', mae)


MAE: 1470.687179082042


In [None]:
## 3rd submission
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Clean the data
# Drop rows where Sales is missing in the training data
train_df.dropna(subset=['Sales'], inplace=True)

# Replace missing values in 'InventoryRatio' with the mean for both train and test data
train_df['InventoryRatio'].fillna(train_df['InventoryRatio'].mean(), inplace=True)
test_df['InventoryRatio'].fillna(train_df['InventoryRatio'].mean(), inplace=True)

# Select features
X = train_df.drop(['Sales'], axis=1).select_dtypes(exclude=['object'])
y = train_df['Sales']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on validation data
pred_val = model.predict(X_val)
mae_val = mean_absolute_error(y_val, pred_val)
print('Validation MAE:', mae_val)

# Predict on test data
X_test = test_df.select_dtypes(exclude=['object'])
predictions = model.predict(X_test)

# Evaluate on training data
pred_train = model.predict(X_train)
mae_train = mean_absolute_error(y_train, pred_train)
print('Training MAE:', mae_train)


In [9]:
## 4th submission
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from scipy.stats.mstats import winsorize

# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Clean the data
# Drop rows where Sales is missing in the training data
train_df.dropna(subset=['Sales'], inplace=True)

# Identify outliers in 'Sales' using z-score
z_scores = (train_df['Sales'] - train_df['Sales'].mean()) / train_df['Sales'].std()
outliers = train_df[abs(z_scores) > 3]  # Considering values with z-score > 3 as outliers

# Winsorize 'Sales' at the 95th percentile
train_df['Sales'] = winsorize(train_df['Sales'], limits=(0, 0.05))

# Replace missing values in 'InventoryRatio' with the mean for both train and test data
train_df['InventoryRatio'].fillna(train_df['InventoryRatio'].mean(), inplace=True)
test_df['InventoryRatio'].fillna(train_df['InventoryRatio'].mean(), inplace=True)

# Select features
X = train_df.drop(['Sales'], axis=1).select_dtypes(exclude=['object'])
y = train_df['Sales']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on training and validation data
pred_train = model.predict(X_train)
mae_train = mean_absolute_error(y_train, pred_train)
print('Training MAE:', mae_train)

pred_val = model.predict(X_val)
mae_val = mean_absolute_error(y_val, pred_val)
print('Validation MAE:', mae_val)


Training MAE: 1424.3035284119621
Validation MAE: 1658.4919697584153


In [None]:
## Tried Different methods and Tried gradient boosting

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load and clean the training data
df_train = pd.read_csv('train.csv')

# Separate features and target variable
X = df_train.drop('Sales', axis=1)
y = df_train['Sales']

# Handle missing values in target variable (example using mean imputation)
imputer = SimpleImputer(strategy='mean')
y = imputer.fit_transform(y.values.reshape(-1, 1))[:, 0]


# Preprocessing steps
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')



# Load the test data
df_test = pd.read_csv('test.csv')

# Predict using the model
y_test_pred = model.predict(df_test)  # Ensure this line is executed after model training

# Create submission DataFrame
submission_df = pd.DataFrame({'ID': df_test.ID, 'Sales': y_test_pred})

# Save the submission file
submission_df.to_csv('final_submission.csv', index=False)
print('Submission file saved successfully.')


MAE: 898.7637723179258
Submission file saved successfully.


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Input, LSTM, Dense, Reshape
from tensorflow.keras.regularizers import l2  # Import
from sklearn.model_selection import cross_val_score
indicators = pd.read_csv('EconomicIndicators.csv')
indicators.head()

Unnamed: 0,Month,Consumer Sentiment,Interest Rate,PMI,Money Supply,NationalEAI,EastEAI,WestEAI,SouthEAI,NorthEAI
0,1,67.2,1.5385,55.5,20847.8,57.083078,56.512247,54.628506,56.512247,57.083078
1,2,62.8,1.811579,57.3,20964.3,47.496553,45.454201,47.021588,45.454201,47.496553
2,3,59.4,2.10913,58.8,21115.6,41.697385,39.904398,42.656425,41.280411,41.697385
3,4,65.2,2.7775,59.2,21315.8,48.503429,46.417782,46.417782,48.018395,43.653086
4,5,58.4,2.874286,57.0,21549.3,41.535949,38.379217,39.749903,42.491276,49.843138


In [12]:
indicators = pd.read_csv('EconomicIndicators.csv')
indicators = indicators.drop(['EastEAI', 'WestEAI', 'NorthEAI', 'SouthEAI'], axis = 1)
indicators['Quarter'] = pd.cut(indicators['Month'], bins=[0, 3, 6, 9, 12, 15, 18, 21, 24, 27], labels=['Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9'])
indicators = indicators.groupby('Quarter').mean().drop('Month', axis=1)
indicators = indicators.reset_index()
indicators['Quarter'] = indicators['Quarter'].astype(str)
indicators

Unnamed: 0,Quarter,Consumer Sentiment,Interest Rate,PMI,Money Supply,NationalEAI
0,Q1,63.133333,1.819736,57.2,20975.9,48.759005
1,Q2,57.866667,2.947262,56.3,21475.8,38.324704
2,Q3,56.1,3.229186,51.9,21648.566667,36.429408
3,Q4,58.8,3.999262,48.1,21678.4,41.830051
4,Q5,64.6,3.802861,47.8,21539.333333,51.604474
5,Q6,62.3,3.692629,48.3,21326.433333,47.580357
6,Q7,69.6,4.311674,48.9,20893.733333,58.820032
7,Q8,64.933333,4.421024,49.2,20846.3,53.884995
8,Q9,69.9,3.945484,50.033333,20768.8,61.509561


In [13]:
##10th Submission

# Load and clean the training data
df_train = pd.read_csv('train.csv')

# Separate features and target variable
X = df_train.drop('Sales', axis=1)
y = df_train['Sales']

# Handle missing values in target variable (example using median imputation)
imputer = SimpleImputer(strategy='median')
y = imputer.fit_transform(y.values.reshape(-1, 1))[:, 0]


# Preprocessing steps
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Merge with Economic data
X_train = pd.merge(X_train, indicators, left_on='Quarter', right_on='Quarter', how='left')
X_test = pd.merge(X_test, indicators, left_on='Quarter', right_on='Quarter', how='left')

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')

y_train_pred = model.predict(X_train)

# Calculate train MAE
train_mae = mean_absolute_error(y_train, y_train_pred)
print(f'Train MAE: {train_mae}')
# # Load the test data
df_test = pd.read_csv('test.csv')
df_test = pd.merge(df_test, indicators, left_on='Quarter', right_on='Quarter', how='left')

# Predict using the model
y_test_pred = model.predict(df_test)  # Ensure this line is executed after model training

# Create submission DataFrame
submission_df = pd.DataFrame({'ID': df_test.ID, 'Sales': y_test_pred})

# Save the submission file
submission_df.to_csv('final_submission.csv', index=False)
print('Submission file saved successfully.')


MAE: 865.9460388855515
Train MAE: 787.8099615368009
Submission file saved successfully.


In [14]:
from sklearn.ensemble import RandomForestRegressor

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor())
])

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate (same as linear regression)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')
# Calculate train MAE
train_mae = mean_absolute_error(y_train, y_train_pred)
print(f'Train MAE: {train_mae}')



MAE: 978.133352910053
Train MAE: 787.8099615368009


In [15]:


# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor())
])

# Train the model (might take longer than Random Forest or Linear Regression)
model.fit(X_train, y_train)

# Predict and evaluate (same as linear regression)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')


MAE: 1076.483970811632


In [16]:
# Load and clean the training data
df_train = pd.read_csv('train.csv')

# Separate features and target variable
X = df_train.drop('Sales', axis=1)
y = df_train['Sales']

# Handle missing values in target variable (example using median imputation)
imputer = SimpleImputer(strategy='median')
y = imputer.fit_transform(y.values.reshape(-1, 1))[:, 0]


# Preprocessing steps
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])
# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')

# Convert scores to positive MAE
cv_scores = -cv_scores

# Print cross-validation scores
print("Cross-validation MAE scores:", cv_scores)
print("Average MAE:", cv_scores.mean())
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Merge with Economic data
X_train = pd.merge(X_train, indicators, left_on='Quarter', right_on='Quarter', how='left')
X_test = pd.merge(X_test, indicators, left_on='Quarter', right_on='Quarter', how='left')

# Train the model
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')

y_train_pred = model.predict(X_train)

# Calculate train MAE
train_mae = mean_absolute_error(y_train, y_train_pred)
print(f'Train MAE: {train_mae}')
# # Load the test data
df_test = pd.read_csv('test.csv')
df_test = pd.merge(df_test, indicators, left_on='Quarter', right_on='Quarter', how='left')

# Predict using the model
y_test_pred = model.predict(df_test)  # Ensure this line is executed after model training

# Create submission DataFrame
submission_df = pd.DataFrame({'ID': df_test.ID, 'Sales': y_test_pred})

# Save the submission file
submission_df.to_csv('final_submission.csv', index=False)
print('Submission file saved successfully.')


Cross-validation MAE scores: [2227.73183698 1903.14723167 1436.54961161 1772.84661596 1691.11174476]
Average MAE: 1806.2774081972907
MAE: 865.9460388855515
Train MAE: 787.8099615368009
Submission file saved successfully.


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Define a dictionary to store model names and corresponding models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=0),
    "Random Forest": RandomForestRegressor(random_state=0),
    "Gradient Boosting": GradientBoostingRegressor(random_state=0)
}

# Perform cross-validation and evaluate each model
for name, model in models.items():
    # Define the model pipeline
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Perform cross-validation
    cv_scores = cross_val_score(model_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    cv_scores = -cv_scores  # Convert scores to positive MAE
    avg_mae = cv_scores.mean()  # Calculate average MAE
    
    # Print model name and average MAE
    print(f"{name}: Average MAE = {avg_mae}")


Linear Regression: Average MAE = 1806.2774081972907
Decision Tree: Average MAE = 1828.9451851851852
Random Forest: Average MAE = 1500.7008992239857
Gradient Boosting: Average MAE = 1439.3871452428798


In [20]:
## the submission which gave private MAE = 819

import pandas as pd
import numpy as np
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
# Reading datasets
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_indicators = pd.read_csv('EconomicIndicators.csv')
df_indicators.head()

Unnamed: 0,Month,Consumer Sentiment,Interest Rate,PMI,Money Supply,NationalEAI,EastEAI,WestEAI,SouthEAI,NorthEAI
0,1,67.2,1.5385,55.5,20847.8,57.083078,56.512247,54.628506,56.512247,57.083078
1,2,62.8,1.811579,57.3,20964.3,47.496553,45.454201,47.021588,45.454201,47.496553
2,3,59.4,2.10913,58.8,21115.6,41.697385,39.904398,42.656425,41.280411,41.697385
3,4,65.2,2.7775,59.2,21315.8,48.503429,46.417782,46.417782,48.018395,43.653086
4,5,58.4,2.874286,57.0,21549.3,41.535949,38.379217,39.749903,42.491276,49.843138


In [21]:
df_indicators['Quarter'] = df_indicators['Month'].apply(lambda x: "Q"+str((x - 1) // 3 + 1))

# Ensure that df_indicators has unique quarters by aggregating data
df_indicators = df_indicators.groupby('Quarter').mean().reset_index()

# Merge the datasets based only on 'Quarter', resolving overlapping column names
df_train = pd.merge(df_train, df_indicators, on='Quarter', how='left', suffixes=('', '_eco'))
df_test = pd.merge(df_test, df_indicators, on='Quarter', how='left', suffixes=('', '_eco'))

In [22]:
df_train.drop('ID', axis=1, inplace=True)
df_train.dropna(subset=['Sales'], inplace=True)
df_train.head()

Unnamed: 0,Company,Quarter,QuickRatio,InventoryRatio,RevenueGrowth,MarketshareChange,Bond rating,Stock rating,Region,Industry,...,Month,Consumer Sentiment,Interest Rate,PMI,Money Supply,NationalEAI,EastEAI,WestEAI,SouthEAI,NorthEAI
0,CMP01,Q1,2.02,7.71,0.05,-0.04,CCC,Buy,South,Metal Fabrication,...,2.0,63.133333,1.819736,57.2,20975.9,48.759005,47.290282,48.102173,47.748953,48.759005
1,CMP01,Q2,2.01,4.1,0.03,0.0,CCC,Hold,South,Metal Fabrication,...,5.0,57.866667,2.947262,56.3,21475.8,38.324704,36.768411,36.676742,38.124071,39.476987
2,CMP01,Q3,2.02,6.79,0.06,-0.02,CCC,Buy,South,Metal Fabrication,...,8.0,56.1,3.229186,51.9,21648.566667,36.429408,35.565655,36.163304,35.214299,39.926052
3,CMP01,Q4,1.98,3.97,0.01,0.02,CCC,Buy,South,Metal Fabrication,...,11.0,58.8,3.999262,48.1,21678.4,41.830051,40.472376,40.913394,42.286551,41.634372
4,CMP01,Q5,1.96,7.41,-0.07,0.02,CCC,Buy,South,Metal Fabrication,...,14.0,64.6,3.802861,47.8,21539.333333,51.604474,48.749299,50.452247,51.671275,48.218051


In [23]:
# Pipelines for numerical and categorical columns handling

numerical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [25]:
numerical_cols = ['QuickRatio', 'InventoryRatio', 'RevenueGrowth', 'MarketshareChange']
categorical_cols = ['Company', 'Quarter', 'Bond rating', 'Stock rating', 'Region', 'Industry']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipe, numerical_cols),
        ('cat', categorical_pipe, categorical_cols)
    ])
# The training data is split into training and validation sets to evaluate the model's performance

X = df_train[numerical_cols + categorical_cols]
y = df_train['Sales']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

In [26]:
## Lasso Regression

lasso_model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('lasso_model', Lasso(alpha=1))])

lasso_model.fit(X_train, y_train)

y_train_pred = lasso_model.predict(X_train)
y_test_pred = lasso_model.predict(X_valid)

train_mae_lr = mean_absolute_error(y_train, y_train_pred)
valid_mae_lr = mean_absolute_error(y_valid, y_test_pred)

print(f'Training MAE: {train_mae_lr}')
print(f'Test MAE: {valid_mae_lr}')

Training MAE: 750.2569385747452
Test MAE: 784.9466555510543


In [27]:
## Ridge Regression

ridge_model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('ridge_model', Ridge(alpha=1))])

ridge_model.fit(X_train, y_train)

y_train_pred = ridge_model.predict(X_train)
y_test_pred = ridge_model.predict(X_valid)

train_mae_lr = mean_absolute_error(y_train, y_train_pred)
valid_mae_lr = mean_absolute_error(y_valid, y_test_pred)

print(f'Training MAE: {train_mae_lr}')
print(f'Test MAE: {valid_mae_lr}')

Training MAE: 778.1737599627397
Test MAE: 783.5582391033979


In [28]:
from sklearn.model_selection import GridSearchCV

# Define a range of alpha values for tuning the Ridge model
param_grid = {'ridge_model__alpha': np.logspace(-3, 3, 13)}

# Setup the grid search
ridge_grid_search = GridSearchCV(
    ridge_model,
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    verbose=1
)

# Fit the grid search to the data
ridge_grid_search.fit(X_train, y_train)

# Get the best model
best_ridge_model = ridge_grid_search.best_estimator_

# Predict and evaluate
y_valid_pred = best_ridge_model.predict(X_valid)
valid_mae = mean_absolute_error(y_valid, y_valid_pred)
print("Best Ridge Model Validation MAE:", valid_mae)


Fitting 5 folds for each of 13 candidates, totalling 65 fits
Best Ridge Model Validation MAE: 789.4972987305954
