In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Dataset imports

In [19]:
cpi_df=pd.read_csv("dataset/CPI-Rural,Urban,Combined.csv")

In [20]:
cpi_df_copy = cpi_df.copy()

In [21]:
cpi_df.head(10)

Unnamed: 0,Month,Commodity Description,Provisional / Final,Rural(Current Month),Rural(Inflation Y-o-Y),Urban(Current Month),Urban(Inflation Y-o-Y),Combined(Current Month),Combined(Inflation Y-o-Y)
0,Oct-23,A) General Index,Provisional,187.0,,183.4,,185.3,
1,Oct-23,A.1) Food and beverages,Provisional,188.4,,194.9,,190.8,
2,Oct-23,A.1.1) Cereals and products,Provisional,182.9,,182.7,,182.8,
3,Oct-23,A.1.2) Meat and fish,Provisional,215.1,,222.9,,217.8,
4,Oct-23,A.1.3) Egg,Provisional,185.3,,189.3,,186.8,
5,Oct-23,A.1.4) Milk and products,Provisional,181.7,,182.2,,181.9,
6,Oct-23,A.1.5) Oils and fats,Provisional,163.1,,157.0,,160.9,
7,Oct-23,A.1.6) Fruits,Provisional,174.9,,183.8,,179.1,
8,Oct-23,A.1.7) Vegetables,Provisional,190.3,,234.0,,205.1,
9,Oct-23,A.1.8) Pulses and products,Provisional,199.7,,205.7,,201.7,


In [22]:
# Data Preprocessing
cpi_df['Year'] = '20' + cpi_df['Month'].str[-2:]  # Extract last 2 digits and append '20' to form the year
cpi_df['Month'] = cpi_df['Month'].str[:-3]  # Remove last 3 characters to get the month
cpi_df.tail()

Unnamed: 0,Month,Commodity Description,Provisional / Final,Rural(Current Month),Rural(Inflation Y-o-Y),Urban(Current Month),Urban(Inflation Y-o-Y),Combined(Current Month),Combined(Inflation Y-o-Y),Year
6508,Jan,A.6.2) Health,Final,104.0,,104.1,,104.0,,2013
6509,Jan,A.6.3) Transport and communication,Final,103.3,,103.2,,103.2,,2013
6510,Jan,A.6.4) Recreation and amusement,Final,103.4,,102.9,,103.1,,2013
6511,Jan,A.6.5) Education,Final,103.8,,103.5,,103.6,,2013
6512,Jan,A.6.6) Personal Care and Effects,Final,104.7,,104.3,,104.5,,2013


In [23]:
# Create a mapping of month names to numbers
month_mapping = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

# Apply the mapping to the 'Month' column
cpi_df['Month'] = cpi_df['Month'].map(month_mapping)

cpi_df.replace('', pd.NA, inplace=True)
cpi_df.replace(' ', pd.NA, inplace=True)
cpi_df.fillna(0, inplace=True)
# Convert numeric columns to float
numeric_columns = cpi_df.select_dtypes(include=['number']).columns
cpi_df[numeric_columns] = cpi_df[numeric_columns].astype(float)
# Model Selection and Training
target_columns = ['Rural(Current Month)','Urban(Current Month)']
# Exclude target columns from features
features = cpi_df.drop(columns=target_columns).columns.tolist()

# Specify the target columns
target_rural = 'Rural(Current Month)'
target_urban = 'Urban(Current Month)'

In [24]:
cpi_df.head()

Unnamed: 0,Month,Commodity Description,Provisional / Final,Rural(Current Month),Rural(Inflation Y-o-Y),Urban(Current Month),Urban(Inflation Y-o-Y),Combined(Current Month),Combined(Inflation Y-o-Y),Year
0,10.0,A) General Index,Provisional,187.0,0.0,183.4,0.0,185.3,0.0,2023
1,10.0,A.1) Food and beverages,Provisional,188.4,0.0,194.9,0.0,190.8,0.0,2023
2,10.0,A.1.1) Cereals and products,Provisional,182.9,0.0,182.7,0.0,182.8,0.0,2023
3,10.0,A.1.2) Meat and fish,Provisional,215.1,0.0,222.9,0.0,217.8,0.0,2023
4,10.0,A.1.3) Egg,Provisional,185.3,0.0,189.3,0.0,186.8,0.0,2023


In [25]:
# One-Hot Encoding
cpi_df = pd.get_dummies(cpi_df, columns=['Commodity Description','Provisional / Final'], drop_first=True)

In [26]:
features = features[:1] + features[3:]

## Train test split

In [27]:
# Data Splitting
train, test = train_test_split(cpi_df, test_size=0.2, random_state=42)

## Model Selection

In [28]:
# Linear Regression
model_lr_rural = LinearRegression()
model_lr_urban = LinearRegression()

# Random Forest
model_rf_rural = RandomForestRegressor()
model_rf_urban = RandomForestRegressor()

# Gradient Boosting
model_gb_rural = GradientBoostingRegressor()
model_gb_urban = GradientBoostingRegressor()

# Support Vector Machine
model_svm_rural = make_pipeline(StandardScaler(), SVR())
model_svm_urban = make_pipeline(StandardScaler(), SVR())

models = [
    ('Linear Regression Rural', model_lr_rural),
    ('Linear Regression Urban', model_lr_urban),
    ('Random Forest Rural', model_rf_rural),
    ('Random Forest Urban', model_rf_urban),
    ('Gradient Boosting Rural', model_gb_rural),
    ('Gradient Boosting Urban', model_gb_urban),
    ('SVM Rural', model_svm_rural),
    ('SVM Urban', model_svm_urban),
]

## Training

In [29]:
best_predictions_rural = None
best_predictions_urban = None
best_model_name_rural = None
best_model_name_urban = None
best_mse_rural = float('inf')
best_mse_urban = float('inf')

for model_name, model in models:
    # Train the model for rural inflation
    model.fit(train[features], train[target_rural])
    predictions_rural = model.predict(test[features])

    # Evaluate model for rural inflation
    mse_rural = mean_squared_error(test[target_rural], predictions_rural)

    # Train the model for urban inflation
    model.fit(train[features], train[target_urban])
    predictions_urban = model.predict(test[features])

    # Evaluate model for urban inflation
    mse_urban = mean_squared_error(test[target_urban], predictions_urban)

    # Print MSE for each model
    print(f'Mean Squared Error ({model_name} - Rural): {mse_rural}')
    print(f'Mean Squared Error ({model_name} - Urban): {mse_urban}')
    print('---')

    # Update best predictions if the current model has a lower MSE
    if mse_rural < best_mse_rural:
        best_mse_rural = mse_rural
        best_model_name_rural = model_name
        best_predictions_rural = predictions_rural

    if mse_urban < best_mse_urban:
        best_mse_urban = mse_urban
        best_model_name_urban = model_name
        best_predictions_urban = predictions_urban

# Save the best predictions to a CSV file
best_predictions_df = pd.DataFrame({
    'Best Predicted Rural(base year 2013)': best_predictions_rural,
    'Best Predicted Urban(base year 2013)': best_predictions_urban
})
best_predictions_df.to_csv('new_predictions.csv', index=False)

# Print information about the best models
print(f'Best Model for Rural: {best_model_name_rural}, Best MSE: {best_mse_rural}')
print(f'Best Model for Urban: {best_model_name_urban}, Best MSE: {best_mse_urban}')

Mean Squared Error (Linear Regression Rural - Rural): 601.0200635764097
Mean Squared Error (Linear Regression Rural - Urban): 21.877001197433934
---
Mean Squared Error (Linear Regression Urban - Rural): 601.0200635764097
Mean Squared Error (Linear Regression Urban - Urban): 21.877001197433934
---
Mean Squared Error (Random Forest Rural - Rural): 84.12556011438495
Mean Squared Error (Random Forest Rural - Urban): 12.33886490732814
---
Mean Squared Error (Random Forest Urban - Rural): 75.7714775149679
Mean Squared Error (Random Forest Urban - Urban): 12.368276293786188
---
Mean Squared Error (Gradient Boosting Rural - Rural): 145.61040292239937
Mean Squared Error (Gradient Boosting Rural - Urban): 20.11368942837942
---
Mean Squared Error (Gradient Boosting Urban - Rural): 146.92489164424427
Mean Squared Error (Gradient Boosting Urban - Urban): 20.03154595055475
---
Mean Squared Error (SVM Rural - Rural): 736.9901688864663
Mean Squared Error (SVM Rural - Urban): 103.7327441142593
---
Mean

In [30]:
merged_df=pd.concat([cpi_df_copy,best_predictions_df],axis=1)
merged_df.head()

Unnamed: 0,Month,Commodity Description,Provisional / Final,Rural(Current Month),Rural(Inflation Y-o-Y),Urban(Current Month),Urban(Inflation Y-o-Y),Combined(Current Month),Combined(Inflation Y-o-Y),Best Predicted Rural(base year 2013),Best Predicted Urban(base year 2013)
0,Oct-23,A) General Index,Provisional,187.0,,183.4,,185.3,,116.572,114.011
1,Oct-23,A.1) Food and beverages,Provisional,188.4,,194.9,,190.8,,173.556,175.954
2,Oct-23,A.1.1) Cereals and products,Provisional,182.9,,182.7,,182.8,,164.906,157.559
3,Oct-23,A.1.2) Meat and fish,Provisional,215.1,,222.9,,217.8,,187.7795,207.116
4,Oct-23,A.1.3) Egg,Provisional,185.3,,189.3,,186.8,,107.287,109.404


In [31]:
merged_df.to_csv('predictions/predictions_visual_analytics.csv', index=False)

## Feedback loop

In [32]:
base_year_now=2013

In [33]:
for i in range(1,2):#here loop is iterated once as we want only base-2014 data. Otherwise, it is working fine for every testcase
    feedback_df=pd.read_csv("predictions/predictions_visual_analytics.csv")
    feedback_df_copy = feedback_df.copy()
    feedback_df['Year'] = '20' + feedback_df['Month'].str[-2:] 
    feedback_df['Month'] = feedback_df['Month'].str[:-3]  
    month_mapping = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    feedback_df['Month'] = feedback_df['Month'].map(month_mapping)
    feedback_df.replace('', pd.NA, inplace=True)
    feedback_df.replace(' ', pd.NA, inplace=True)
    feedback_df.fillna(0, inplace=True)
    numeric_columns = feedback_df.select_dtypes(include=['number']).columns
    feedback_df[numeric_columns] = feedback_df[numeric_columns].astype(float)
    target_columns = ['Rural(Current Month)','Urban(Current Month)']
    features = feedback_df.drop(columns=target_columns).columns.tolist()
    target_rural = 'Rural(Current Month)'
    target_urban = 'Urban(Current Month)'
    feedback_df = pd.get_dummies(feedback_df, columns=['Commodity Description','Provisional / Final'], drop_first=True)
    features = features[:1] + features[3:]
    train, test = train_test_split(feedback_df, test_size=0.2, random_state=42)
    model_lr_rural = LinearRegression()
    model_lr_urban = LinearRegression()
    model_rf_rural = RandomForestRegressor()
    model_rf_urban = RandomForestRegressor()
    model_gb_rural = GradientBoostingRegressor()
    model_gb_urban = GradientBoostingRegressor()
    model_svm_rural = make_pipeline(StandardScaler(), SVR())
    model_svm_urban = make_pipeline(StandardScaler(), SVR())
    models = [
        ('Linear Regression Rural', model_lr_rural),
        ('Linear Regression Urban', model_lr_urban),
        ('Random Forest Rural', model_rf_rural),
        ('Random Forest Urban', model_rf_urban),
        ('Gradient Boosting Rural', model_gb_rural),
        ('Gradient Boosting Urban', model_gb_urban),
        ('SVM Rural', model_svm_rural),
        ('SVM Urban', model_svm_urban),
    ]
    best_predictions_rural = None
    best_predictions_urban = None
    best_model_name_rural = None
    best_model_name_urban = None
    best_mse_rural = float('inf')
    best_mse_urban = float('inf')
    for model_name, model in models:
        model.fit(train[features], train[target_rural])
        predictions_rural = model.predict(test[features])
        mse_rural = mean_squared_error(test[target_rural], predictions_rural)
        model.fit(train[features], train[target_urban])
        predictions_urban = model.predict(test[features])
        mse_urban = mean_squared_error(test[target_urban], predictions_urban)
        print(f'Mean Squared Error ({model_name} - Rural): {mse_rural}')
        print(f'Mean Squared Error ({model_name} - Urban): {mse_urban}')
        print('---')
        if mse_rural < best_mse_rural:
            best_mse_rural = mse_rural
            best_model_name_rural = model_name
            best_predictions_rural = predictions_rural

        if mse_urban < best_mse_urban:
            best_mse_urban = mse_urban
            best_model_name_urban = model_name
            best_predictions_urban = predictions_urban
    current_base_year=base_year_now+i
    best_predictions_df = pd.DataFrame({
        'Predicted Rural CPI(base year '+str(current_base_year)+')': best_predictions_rural,
        'Predicted Urban CPI(base year '+str(current_base_year)+')': best_predictions_urban
    })
    merged_df=pd.concat([feedback_df_copy,best_predictions_df],axis=1)
    merged_df.to_csv('predictions/predictions_visual_analytics.csv', index=False)

Mean Squared Error (Linear Regression Rural - Rural): 599.9095154409252
Mean Squared Error (Linear Regression Rural - Urban): 21.802275134880414
---
Mean Squared Error (Linear Regression Urban - Rural): 599.9095154409252
Mean Squared Error (Linear Regression Urban - Urban): 21.802275134880414
---
Mean Squared Error (Random Forest Rural - Rural): 119.71640378514718
Mean Squared Error (Random Forest Rural - Urban): 12.702563516222684
---
Mean Squared Error (Random Forest Urban - Rural): 127.87366279867811
Mean Squared Error (Random Forest Urban - Urban): 12.3145684877162
---
Mean Squared Error (Gradient Boosting Rural - Rural): 146.76943497547273
Mean Squared Error (Gradient Boosting Rural - Urban): 20.417089918168703
---
Mean Squared Error (Gradient Boosting Urban - Rural): 146.7486225303417
Mean Squared Error (Gradient Boosting Urban - Urban): 20.560464782112835
---
Mean Squared Error (SVM Rural - Rural): 742.5641518061843
Mean Squared Error (SVM Rural - Urban): 102.42856180892427
---
