In [None]:
# importing libraries
import pandas as pd
import numpy as np

In [None]:
# Read the world export/import CSV from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/world-export-and-import-dataset/34_years_world_export_import_dataset.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

In [None]:
# Count of null cells per column (isna is the pandas alias of isnull).
missing_values = dataset.isna().sum()
missing_values

In [None]:
# Keep only the columns that actually contain at least one missing value.
missing_values.loc[missing_values.gt(0)]

In [None]:
# Dimensions of the loaded frame as (rows, columns).
dataset.shape

In [None]:
# Total number of cells in the dataset (rows * columns).
# np.prod is used because np.product was deprecated in NumPy 1.25 and
# removed in NumPy 2.0, so the old spelling fails on current NumPy.
total_dataset = np.prod(dataset.shape)
total_dataset

In [None]:
# Total number of missing cells: the sum of the per-column null counts
# held in missing_values.
total_missing = missing_values.sum()
total_missing

In [None]:
# Share of all cells in the dataset that are missing, expressed as a percentage.
missing_fraction = total_missing / total_dataset
percent = missing_fraction * 100
percent

In [None]:
# Prediction target: the export value column, taken as a Series.
y = dataset.loc[:, 'Export (US$ Thousand)']
y.head()


In [None]:
# Features: everything except the target column, restricted to the
# non-object (i.e. non-string) dtypes so the model receives numeric input only.
drop = dataset.drop(columns=['Export (US$ Thousand)'])
X = drop.select_dtypes(exclude=['object'])
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state pins the shuffle so
# the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

def score_data(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and score it on the test split.

    A RandomForestRegressor with 100 trees and a fixed random_state is trained
    on (X_train, y_train); its predictions for X_test are compared with y_test.

    Returns:
        The mean absolute error of the predictions on the test split.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_test, forest.predict(X_test))

**Approach 1: Drop columns with missing values**

In [None]:
# Names of the feature columns that have at least one missing value in the
# training split; the same columns are removed from both splits.
has_missing = X_train.isnull().any()
missing_with_columns = has_missing[has_missing].index.tolist()

drop_X_train = X_train.drop(columns=missing_with_columns)
drop_X_test = X_test.drop(columns=missing_with_columns)

print(score_data(drop_X_train, drop_X_test, y_train, y_test))


**Approach 2: Imputation**

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values using SimpleImputer's default strategy (the column mean).
impute = SimpleImputer()

# Learn the fill values from the training split only, then apply those same
# values to the test split. Calling fit_transform on X_test would re-fit the
# imputer on test data, leaking test statistics into preprocessing and
# filling the two splits with different values.
imputed_X_train = pd.DataFrame(
    impute.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,  # keep rows aligned with y_train
)
imputed_X_test = pd.DataFrame(
    impute.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,  # keep rows aligned with y_test
)

print(score_data(imputed_X_train, imputed_X_test, y_train, y_test))

**Approach 3: An extension to imputation**

In [None]:
# Work on copies so the original splits stay untouched for the other approaches.
X_train_plus = X_train.copy()
X_test_plus = X_test.copy()

# For every column that had gaps in the training split, add a boolean
# indicator recording which rows were missing before imputation.
for col in missing_with_columns:
    X_train_plus[col + "_was_missing"] = X_train_plus[col].isnull()
    X_test_plus[col + "_was_missing"] = X_test_plus[col].isnull()

# Use a dedicated imputer so the one fitted for Approach 2 is not re-fitted
# on a different set of columns. Fit on the training split only and reuse the
# learned fill values on the test split; fit_transform on the test split
# would leak test statistics into preprocessing.
impute_plus = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(
    impute_plus.fit_transform(X_train_plus),
    columns=X_train_plus.columns,
    index=X_train_plus.index,  # keep rows aligned with y_train
)
imputed_X_test_plus = pd.DataFrame(
    impute_plus.transform(X_test_plus),
    columns=X_test_plus.columns,
    index=X_test_plus.index,  # keep rows aligned with y_test
)

print(score_data(imputed_X_train_plus, imputed_X_test_plus, y_train, y_test))