<a href="https://colab.research.google.com/github/VIPlearner/csv-files/blob/main/Joshua.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#installing relevant libraries
!pip install xgboost
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, MinMaxScaler, Normalizer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Load the training set straight from GitHub; each row is keyed by its VehicleID.
train_csv_url = "https://raw.githubusercontent.com/VIPlearner/csv-files/main/Train.csv"
df = pd.read_csv(train_csv_url, index_col="VehicleID")


In [None]:
# Remove the features that are not fed to the model.
# 'Maker' and 'Model' are deliberately kept (their drops were tried and disabled).
df.drop(columns=['Colour', 'Distance'], inplace=True)


In [None]:
# 'Year' is handled as text here: strip the commas first, then cast the result to float.
year_without_commas = df['Year'].str.replace(',', '')
df['Year'] = year_without_commas.astype(float)
# The same cleanup for 'Distance' is not needed while that column is dropped.


In [None]:
# Cast every categorical feature to the pandas 'category' dtype ahead of one-hot encoding.
# ('Colour' is not listed because it has already been dropped from the frame.)
for categorical_column in ['Location', 'Type', 'Maker', 'Model']:
    df[categorical_column] = df[categorical_column].astype('category')


In [None]:
# Rows without a price cannot be used for supervised training, so keep only labelled rows.
target_column = "Amount (Million Naira)"
df = df[df[target_column].notna()]

# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the split reproducible.
features = df.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    features, df[target_column], test_size=0.3, random_state=42
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7188 entries, VHL12546 to VHL15246
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   Location                7188 non-null   category
 1   Maker                   7188 non-null   category
 2   Model                   7188 non-null   category
 3   Year                    7167 non-null   float64 
 4   Amount (Million Naira)  7188 non-null   float64 
 5   Type                    6992 non-null   category
dtypes: category(4), float64(2)
memory usage: 506.2+ KB


In [None]:
# Per-column preprocessing, assembled into a single ColumnTransformer below.

# Plain categorical columns are one-hot encoded; categories unseen at fit time are ignored.
categorical_features = ["Location", "Maker", "Model"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# 'Year': fill gaps with the most frequent value, then rescale with MinMaxScaler.
year_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("scaler", MinMaxScaler()),
    ]
)

# 'Distance': mean imputation + MinMaxScaler. Defined but not wired into the
# preprocessor below (the "distance" entry is disabled).
distance_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler()),
    ]
)

# 'Type' has missing values, so it is imputed before being one-hot encoded.
type_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

column_transformers = [
    ("year", year_transformer, ["Year"]),
    # ("distance", year_transformer, ["Distance"]),
    ("cat", categorical_transformer, categorical_features),
    ("type_cat", type_transformer, ["Type"]),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Full model: the column preprocessor followed by an XGBRegressor built with no arguments.
model_steps = [
    ("preprocessor", preprocessor),
    ("estimator", XGBRegressor()),
]
pipeline = Pipeline(steps=model_steps)
# colsample_bytree = 1, learning_rate = 0.1, max_depth = 3, n_estimators = 3000

In [None]:
# List every parameter name the pipeline exposes. Nested steps appear with the
# `step__param` naming (e.g. `preprocessor__year__imputer__strategy`), which is
# the form a parameter search over this pipeline would use as keys.
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'estimator', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__year', 'preprocessor__cat', 'preprocessor__type_cat', 'preprocessor__year__memory', 'preprocessor__year__steps', 'preprocessor__year__verbose', 'preprocessor__year__imputer', 'preprocessor__year__scaler', 'preprocessor__year__imputer__add_indicator', 'preprocessor__year__imputer__copy', 'preprocessor__year__imputer__fill_value', 'preprocessor__year__imputer__missing_values', 'preprocessor__year__imputer__strategy', 'preprocessor__year__imputer__verbose', 'preprocessor__year__scaler__clip', 'preprocessor__year__scaler__copy', 'preprocessor__year__scaler__feature_range', 'preprocessor__cat__categories', 'preprocessor__cat__drop', 'preprocessor__cat__dtype', 'preprocessor__cat__handle_unkn

In [None]:
#this parameter variable was defined for the GridSearchCV to determine the best parameters for the model 
# parameters = {#'estimator__alpha': [1],
#               #'estimator':[DecisionTreeRegressor()],
#               # 'estimator__n_estimators': [3000],
#               # 'estimator__max_features': ['auto']
# }

In [None]:
# grid = RandomizedSearchCV(pipeline, parameters, cv=2, n_iter=50)

# Fit on the training split only. Fitting on the whole frame would also train on
# the rows held out in X_test / y_test, so any score computed on that split
# would be measured on data the model has already seen (data leakage).
# If a final model trained on every labelled row is wanted for the submission,
# refit on the full frame *after* the hold-out evaluation has been recorded.
pipeline.fit(X_train, y_train)



Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('year',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['Year']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Location', 'Maker',
                                                   'Model']),
                                                 ('type_cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImpu

In [None]:
# grid.best_params_

In [None]:
# Measuring model performance on the held-out split.
# NOTE: these numbers are only a genuine hold-out estimate if the pipeline was
# fitted without the X_test rows.
print("model score: %.3f" % pipeline.score(X_test, y_test))  # R^2 for a regressor pipeline
y_pred = pipeline.predict(X_test)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` flag of
# mean_squared_error is deprecated in recent scikit-learn releases and later
# removed, whereas this form works on every version.
errors = np.sqrt(mean_squared_error(y_test, y_pred))
print(errors)

model score: 0.790
10.027662190972178


In [None]:
# Repeating the preprocessing steps for the test file.
# VehicleID stays a regular column here (no index_col), and no columns are dropped.
test_df = pd.read_csv("https://raw.githubusercontent.com/VIPlearner/csv-files/main/Test.csv")

# Same cleanup as the training frame: strip commas from 'Year' and cast to float.
test_df['Year'] = test_df['Year'].str.replace(',', '').astype(float)

# Cast the categorical columns to the 'category' dtype, as done for training.
# 'Colour' is cast as well because it is still present in this frame.
for categorical_column in ['Location', 'Type', 'Maker', 'Model', 'Colour']:
    test_df[categorical_column] = test_df[categorical_column].astype('category')

In [None]:
# Summarise the prepared test frame: column names, non-null counts and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2061 entries, 0 to 2060
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   VehicleID  2061 non-null   object  
 1   Location   2061 non-null   category
 2   Maker      2061 non-null   category
 3   Model      2061 non-null   category
 4   Year       2059 non-null   float64 
 5   Colour     2061 non-null   category
 6   Type       2007 non-null   category
 7   Distance   1385 non-null   float64 
dtypes: category(5), float64(2), object(1)
memory usage: 83.6+ KB


In [None]:
# Predict a price for every row of the test file. The frame is passed as-is;
# the preprocessor addresses its inputs by column name
# (Year, Location, Maker, Model, Type).
test_y_pred = pipeline.predict(test_df)

In [None]:
# Pair each vehicle id with its predicted price in the two-column submission layout.
prediction_columns = {
    'VehicleID': test_df['VehicleID'],
    'Amount (Million Naira)': test_y_pred,
}
test_pred_df = pd.DataFrame(prediction_columns)
test_pred_df.head()

Unnamed: 0,VehicleID,Amount (Million Naira)
0,VHL18518,4.676809
1,VHL17149,7.460948
2,VHL10927,4.425385
3,VHL12909,6.117086
4,VHL12348,8.025303


In [None]:
# converting dataframe to csv file
# Writes 'ninthtry.csv' to the current working directory, UTF-8 encoded;
# index=False keeps the row index out of the file so only the two data
# columns are written.
test_pred_df.to_csv('ninthtry.csv', encoding='utf-8', index=False)

In [None]:
# Downloading the csv file through the browser.
# `google.colab` only exists inside a Colab runtime, so the import is guarded:
# anywhere else the notebook keeps running instead of stopping with ImportError.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of 'ninthtry.csv'.")
else:
    files.download("ninthtry.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Runtime check unrelated to the model above: ask TensorFlow for the name of a GPU device.
# NOTE(review): the recorded output is '' — presumably meaning no GPU was
# attached to this runtime; confirm against the TensorFlow documentation.
import tensorflow as tf
tf.test.gpu_device_name()

''

In [None]:
# pd.read_excel()

In [None]:
# Read the saved submission back from disk, keyed by VehicleID, then display it
# with the id restored as an ordinary column (the displayed copy is not stored).
fit = pd.read_csv('ninthtry.csv').set_index('VehicleID')
fit.reset_index()

Unnamed: 0,VehicleID,Amount (Million Naira)
0,VHL18518,4.676809
1,VHL17149,7.460948
2,VHL10927,4.425384
3,VHL12909,6.117086
4,VHL12348,8.025303
...,...,...
2056,VHL17903,26.745415
2057,VHL14018,6.246005
2058,VHL17473,10.226078
2059,VHL11480,8.025303


In [None]:
class FunEvent:
    """A small record pairing a collection of tags with a year."""

    def __init__(self, tags, year):
        """Store `tags` and `year` exactly as given.

        `tags` is kept by reference (no copy is made), so later mutations of
        the caller's object are visible through this instance.
        """
        self.tags, self.year = tags, year

    def __str__(self):
        """Return a readable `FunEvent(tags=..., year=...)` description."""
        tags, year = self.tags, self.year
        return f"FunEvent(tags={tags}, year={year})"

# Demonstration of reference semantics: mutating vs. rebinding after construction.
tags = ["google", "ml"]
year = 2022
bootcamp = FunEvent(tags, year)
# Mutates the very list object the instance stored, so the instance sees the new tag.
tags.append("bootcamp")
# Rebinds the local name only; the instance still holds the original 2022.
year = 2023
# Prints tags with 'bootcamp' included but year=2022 (see the recorded output).
print(bootcamp)

FunEvent(tags=['google', 'ml', 'bootcamp'], year=2022)


In [None]:
# NOTE(review): `sqsum1` is not defined anywhere in the visible notebook, and the
# recorded output for this cell is a NameError. Define or import it before this
# cell, or remove the cell, so the notebook runs cleanly from a fresh kernel.
sqsum1()

NameError: ignored