In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


In [None]:
# Load the raw dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='countries of the world.csv')

In [None]:
# Columns that must end up numeric before analysis and modelling.
columns_to_convert = [
    'Population', 'Area (sq. mi.)', 'Pop. Density (per sq. mi.)', 'Coastline (coast/area ratio)',
    'Net migration', 'GDP ($ per capita)', 'Literacy (%)', 'Phones (per 1000)',
    'Arable (%)', 'Crops (%)', 'Climate', 'Birthrate', 'Deathrate', 'Agriculture',
    'Industry', 'Service'
]

# Replace commas with dots, convert to numeric, then impute missing values
# with the column mean.
# NOTE(review): this assumes the comma is a decimal separator (as the original
# comment stated) and never a thousands separator -- confirm against the CSV.
for column in columns_to_convert:
    # Swap ',' for '.' rather than deleting it: deleting would turn a value
    # such as "48,0" into 480 instead of 48.0. regex=False treats ',' as a
    # literal character on every pandas version.
    as_text = df[column].astype(str).str.replace(',', '.', regex=False)
    # Anything that still cannot be parsed (including the 'nan' text produced
    # by astype(str) for missing cells) becomes NaN.
    numeric = pd.to_numeric(as_text, errors='coerce')
    # Assign the filled result back explicitly; calling
    # df[column].fillna(..., inplace=True) operates on a temporary selection,
    # which is deprecated and has no effect under pandas copy-on-write.
    df[column] = numeric.fillna(numeric.mean())

In [None]:
# Univariate analysis: number of rows (countries) per region.
plt.figure(figsize=(10, 6))
# Pass the column by keyword: recent seaborn releases no longer accept the
# plotted vector as the first positional argument (that slot is now `data`).
sns.countplot(data=df, x='Region')
# Rotate the tick labels so they do not overlap.
plt.xticks(rotation=90)
plt.title('Countries per region')
plt.show()

In [None]:
# Numeric columns only. 'number' matches every integer/float width, whereas
# 'int' / 'float' resolve to platform-dependent NumPy types.
df_n = df.select_dtypes(include='number')
features = list(df_n.columns)

# Original author's observation: out of 18 features, 15 features have outliers.

# Draw every boxplot on ONE shared grid. Previously a new 20x50 figure was
# created and shown on each loop iteration, producing one mostly empty figure
# per feature, and the 9x2 grid was hard-coded to at most 18 features.
n_cols = 2
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, feature in zip(flat_axes, features):
    # Keyword `x=` keeps the call valid on current seaborn releases.
    sns.boxplot(x=df_n[feature], ax=ax)
    ax.set_title(feature)

# Hide grid cells left over when the feature count does not fill the grid.
for ax in flat_axes[len(features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Multivariate analysis: absolute pairwise correlations between numeric columns.
plt.figure(figsize=(15, 15))
# Restrict to numeric columns before calling corr(): the frame still holds
# text columns here (e.g. 'Country' and 'Region' are only dropped later), and
# pandas >= 2.0 raises on them instead of silently skipping them.
numeric_corr = df.select_dtypes(include='number').corr().abs()
sns.heatmap(numeric_corr, annot=True)
plt.title('Absolute correlation between numeric features')
plt.show()

In [None]:
# Remove these four columns from the frame in place.
df.drop(
    columns=['Other (%)', 'Infant mortality (per 1000 births)', 'Country', 'Region'],
    inplace=True,
)

In [None]:
# Descriptive analysis: summary statistics for every remaining column,
# regardless of dtype.
df.describe(include="all")

In [None]:
# z_scores = np.abs(stats.zscore(df[columns_to_convert]))
# df = df[(z_scores < 3).all(axis=1)]

# Optional log/sqrt transformations (currently disabled, kept for reference)
# df['Population'] = np.log(df['Population'])
# df['Area (sq. mi.)'] = np.log(df['Area (sq. mi.)'])
# df['Pop. Density (per sq. mi.)'] = np.log1p(df['Pop. Density (per sq. mi.)']) 
# df['Coastline (coast/area ratio)'] = np.log1p(df['Coastline (coast/area ratio)'])
# df['Net migration'] = np.sqrt(df['Net migration'])
# df['GDP ($ per capita)'] = np.log(df['GDP ($ per capita)'])
# df['Phones (per 1000)'] = np.sqrt(df['Phones (per 1000)'])
# df['Arable (%)'] = np.sqrt(df['Arable (%)'])
# df['Crops (%)'] = np.log1p(df['Crops (%)']) 
# df['Deathrate'] = np.log(df['Deathrate'])
# df['Agriculture'] = np.sqrt(df['Agriculture'])
# df['Industry'] = np.sqrt(df['Industry'])


In [None]:
# Remove five more columns in place so they are not used as model features.
df.drop(
    columns=[
        'Literacy (%)',
        'Net migration',
        'Population',
        'Area (sq. mi.)',
        'Coastline (coast/area ratio)',
    ],
    inplace=True,
)

In [None]:
# Target: GDP per capita. Features: every other remaining column.
y = df['GDP ($ per capita)']
x = df.drop(columns=['GDP ($ per capita)'])


In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=10,
)

In [None]:
# Baseline model: ordinary least squares fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)

In [None]:
# Random forest base estimator. The seed is fixed (same value as the
# train/test split) so that the grid search, the selected hyperparameters and
# the pickled model are reproducible across "Restart & Run All".
rf = RandomForestRegressor(random_state=10)

# Hyperparameter grid searched for the random forest (3 * 4 * 3 * 3 * 2 = 216
# combinations).
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [None]:
# Exhaustive 5-fold cross-validated search over the random forest grid,
# using all available cores and verbose progress output.
grid_search_rf = GridSearchCV(
    rf,
    param_grid_rf,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(x_train, y_train)

# Estimator refitted on the training split with the best parameter set found.
best_rf = grid_search_rf.best_estimator_

In [None]:
# Report the hyperparameter combination selected by the random forest search.
print(
    "Best parameters for Random Forest:",
    grid_search_rf.best_params_,
)


In [None]:
# Hyperparameter tuning for SVR
# Support vector machines are not scale invariant, so the features are
# standardised first. Keeping the scaler inside the pipeline means it is
# re-fitted on the training portion of every cross-validation fold.
svr = make_pipeline(StandardScaler(), SVR())

# make_pipeline names each step after its lowercased class name, so the SVR
# parameters are addressed with the 'svr__' prefix.
param_grid_svr = {
    'svr__kernel': ['linear', 'rbf'],
    'svr__C': [0.1, 1, 10],
    'svr__gamma': ['scale', 'auto']
}
# NOTE(review): the target is left in its raw units; if SVR still underfits,
# consider a wider C range or scaling the target as well -- confirm empirically.

In [50]:
# Exhaustive 5-fold cross-validated search over the SVR grid,
# using all available cores and verbose progress output.
grid_search_svr = GridSearchCV(
    svr,
    param_grid_svr,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_svr.fit(x_train, y_train)

# Estimator refitted on the training split with the best parameter set found.
best_svr = grid_search_svr.best_estimator_

In [None]:
# Report the hyperparameter combination selected by the SVR search.
print(
    "Best parameters for SVR:",
    grid_search_svr.best_params_,
)

In [None]:
# Test-set predictions from the two tuned models (random forest first, then SVR).
y_pred_rf, y_pred_svr = (
    tuned_model.predict(x_test) for tuned_model in (best_rf, best_svr)
)


In [None]:

# Performance metrics on the test split. Each list is ordered to match the
# 'Model' entry: linear regression, random forest, SVR.
metrics = {
    'Model': ['Linear Regression', 'Random Forest', 'SVR'],
    'R-squared': [
        r2_score(y_test, pred)
        for pred in (y_pred_lr, y_pred_rf, y_pred_svr)
    ],
    'MAE': [
        mean_absolute_error(y_test, pred)
        for pred in (y_pred_lr, y_pred_rf, y_pred_svr)
    ],
    # RMSE is the square root of the mean squared error.
    'RMSE': [
        np.sqrt(mean_squared_error(y_test, pred))
        for pred in (y_pred_lr, y_pred_rf, y_pred_svr)
    ],
}


In [None]:

# One row per model, one column per metric.
metrics_df = pd.DataFrame(data=metrics)


In [None]:
# Persist the tuned random forest to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling fails; the bare open()
# call used previously never closed it.
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_rf, model_file)

In [None]:
# Show the model comparison table as this cell's output.
metrics_df