In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings

warnings.filterwarnings('ignore')

In [48]:
data = pd.read_csv('Fractal_ExportData_eda.csv')
data = data.drop('Unnamed: 0',axis=1)
data

Unnamed: 0,QTY,UNIT_VALUE_USD,DRAWBACK,FOREIGN_COUNTRY,Categories,FOREIGN PORT CONTINENT
0,-0.802040,3,1.341641,0.137036,-0.187192,-1.148426
1,-0.792471,3,1.341641,0.137036,-0.187192,-1.148426
2,-0.804389,3,0.447214,0.137036,-0.187192,-1.148426
3,-0.460945,3,1.341641,1.775013,-0.847298,-1.148426
4,-0.253470,3,1.341641,1.775013,-0.847298,-1.148426
...,...,...,...,...,...,...
12175,-0.259472,0,-1.341641,-1.500941,0.472913,-0.304311
12176,-0.350727,2,1.341641,0.137036,-0.187192,0.539804
12177,-0.187878,2,1.341641,0.137036,-0.187192,0.539804
12178,-0.253035,2,1.341641,0.137036,-0.187192,0.539804


In [3]:
# no linear regression

In [4]:
# we can use decision tree as the data is label encoded
# naive bayes assumes independence and all other are correlated 
# svm can work but will not work in this case as i have tested it 
# knn also didn't work because of non encoded values and gave 59 percent values

In [5]:
from sklearn.tree import DecisionTreeClassifier

In [6]:
model_dt = DecisionTreeClassifier()

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = data.drop('UNIT_VALUE_USD',axis=1)
y = data['UNIT_VALUE_USD']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [9]:
model_dt.fit(X_train,y_train)

In [10]:
from sklearn.metrics import accuracy_score

In [11]:
y_pred = model_dt.predict(X_test)

In [12]:
accuracy_score(y_test,y_pred)

0.7609452736318408

In [13]:
#finding best setting for decision tree 

In [14]:
from sklearn.model_selection import RandomizedSearchCV
classifier = RandomizedSearchCV((model_dt),{
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20, 30],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 6, 10],
    'max_features': [None, 'sqrt', 'log2']
},cv=5,return_train_score=False,n_iter=1000) 
classifier.fit(X.head(1000),y.head(1000))

In [15]:
results = pd.DataFrame(classifier.cv_results_)

In [16]:
results.sort_values(by=['rank_test_score']).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
430,0.004937,0.000389,0.002745,0.000242,10,4,,5,entropy,"{'min_samples_split': 10, 'min_samples_leaf': ...",0.755,0.63,0.77,0.785,0.605,0.709,0.075723,1
434,0.003796,0.000209,0.001935,0.00022,10,6,,5,entropy,"{'min_samples_split': 10, 'min_samples_leaf': ...",0.745,0.63,0.77,0.8,0.6,0.709,0.079272,1
428,0.004895,0.000683,0.0025,0.000284,2,4,,5,entropy,"{'min_samples_split': 2, 'min_samples_leaf': 4...",0.755,0.63,0.77,0.785,0.605,0.709,0.075723,1
422,0.003763,9.2e-05,0.002083,0.000169,10,1,,5,entropy,"{'min_samples_split': 10, 'min_samples_leaf': ...",0.755,0.635,0.76,0.785,0.6,0.707,0.074606,4
421,0.003806,8.1e-05,0.002023,0.00015,5,1,,5,entropy,"{'min_samples_split': 5, 'min_samples_leaf': 1...",0.755,0.635,0.76,0.785,0.6,0.707,0.074606,4


In [17]:
best_dt = classifier.best_estimator_ 

In [18]:
# best_dt is the best decision tree model we got , we can use it in stacking 

In [19]:
# here we should not use random forest as it will desharpen our accuracy, we should use it when we are getting overfitting results
# here we should use xgboost for boost in performance

In [20]:
from xgboost import XGBClassifier

In [21]:
model_xgb = XGBClassifier()

In [22]:
model_xgb.fit(X_train,y_train)

In [23]:
y_pred_xgb = model_xgb.predict(X_test)

In [24]:
accuracy_score(y_test,y_pred_xgb)

0.78681592039801

In [25]:
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
model_gb = GradientBoostingClassifier()

In [26]:
model_gb.fit(X_train,y_train)

In [27]:
y_pred_gb = model_gb.predict(X_test)

In [28]:
accuracy_score(y_test,y_pred_gb)

0.7711442786069652

In [29]:
# sometimes gGradientBoostingClassifier gets more aaccuracy_score that xgb so i tested it too,
# so now we can use

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
model_rf = RandomForestClassifier()

In [32]:
model_rf.fit(X_train,y_train)

In [33]:
y_pred_rf = model_rf.predict(X_test)

In [34]:
accuracy_score(y_test,y_pred_rf)

0.7721393034825871

In [35]:
# so i was wrong random forest can get more accuracy than dt 

classifier = RandomizedSearchCV((model_rf),{
    'n_estimators': [100, 200, 500],
    'criterion': ['gini', 'entropy']
},cv=5,return_train_score=False,n_iter=2000) 
classifier.fit(X,y)

results = pd.DataFrame(classifier.cv_results_)

results.sort_values(by=['rank_test_score'])

In [36]:
# got nothing from here

In [49]:
# so we got randomforest = 77.2% , XGB = 78.6% ,best_dt = 78.5%

In [38]:
from sklearn.ensemble import StackingClassifier

In [39]:
base_learners = [
    ('decision_tree' ,best_dt), 
    ('random_forest',RandomForestClassifier()), 
    ('Xgboost',XGBClassifier())
]

In [40]:
from sklearn.linear_model import LogisticRegression
meta_learner = LogisticRegression(max_iter=1000)

In [41]:
Stacking_clf = StackingClassifier(
    estimators= base_learners, 
    final_estimator= meta_learner, 
    cv=5
)

In [42]:
Stacking_clf.fit(X_train, y_train)

In [43]:
y_pred_stack = Stacking_clf.predict(X_test)

In [44]:
accuracy_score(y_test,y_pred_stack)

0.7858208955223881

In [45]:
# stacking = 78.5

Categories
'Glazed Vitrified Tiles': 1
'Glazed Porcelain Tiles': 2 
'Others Commodities': 3 
'Ceramic Wall Tiles': 4 
'Ceramic Floor Tiles': 5 
'Polished Glazed Vitrified Tiles': 6


tier1_country = ["United States","United Kingdom","Canada","Australia","France","Germany","Italy","Spain","Netherlands","Belgium","Sweden","Japan","Singapore","Ireland","Portugal","Austria","Switzerland","New Zealand"]
tier2_country = ["Russia","Poland","United Arab Emirates","Thailand","Greece","Croatia","Romania","Oman","Albania","Kuwait","Taiwan","Qatar","Saudi Arabia","South Africa","Israel","Nepal","Colombia","Lithuania","Bulgaria","Venezuela","Morocco","Kenya","Ecuador","China","Cyprus","Kazakhstan","Jamaica","Peru","Mauritius","Maldives","Chile","Slovak Republic","Serbia","Sri Lanka","Jordan","Portugal","Angola","Trinidad & Tobago","Slovenia","Nigeria","Guyana","Honduras","Ghana","Netherlands Antilles","Iran","Brazil","Bangladesh","Costa Rica","Malta","Nicaragua","Montenegro","Bahrain","Latvia","Uzbekistan","Ukraine","Georgia","Turkey","Czech Republic","Egypt","Philippines","Bosnia & Herzegovina","Hungary","Lebanon","Korea, Republic Of","Uruguay","Algeria","Armenia","Pakistan","Cambodia","Cameroon","Vietnam, Democratic Rep. Of","Mexico","Dominican Repulic","Indonesia","Gautemala","Moldova, Republic Of","Belize","Palestine State","Fiji","Brunei"]
tier3_country = ["Somalia","Senegal","Djibouti","Madagascar","Tanzania","Azarbaijan","Reunion","Sierra Leona","Gambia","El Salvador","Ethiopia","Yemen, Democratic","Mozambique","Benin","Liberia","Macedonia,The Former Yugoslav Republic Of","Cape Verde Islands","Mauritania","Congo, The Democratic Republic Of The","Guinea","Bhutan","Cote D Ivoire","Togo","French Guyana","Tajikistan","Guinea Bissau","Surinam","Congo","Syria","Kyrghystan","Burkina Faso","Mali","French Polynesia","Martinique","Gabon","Comoros","Afghanistan","Turkmenistan","Malawi","Seychelles","Barbados"]


FOREIGN_COUNTRY
tier1_country:1
tier2_country:2
tier3_country:3

DRAWBACK
(-0.001, 2818.75]     0 
(2818.75, 5432.5]     1 
(5432.5, 11099.75]    2 
(11099.75, 314727.0]  3


FOREIGN PORT CONTINENT
'asia':1 
'europe':2 
'north_america':3
'africa':4 
'south_america':5
'oceania':6


UNIT_VALUE_USD
(-0.001, 0.23]    0
(0.23, 0.367]     1
(0.367, 4.76]     2
(4.76, 499.744]   3

In [46]:
# above is how we are getting our columns

In [47]:
#this model went from 11 percent accuracy in linear reg to 82 percent

In [51]:
# so we got randomforest = 77.2% , XGB = 78.6% ,best_dt = 78.5% ,stacking = 78.5%

In [53]:
# so we are taking XGB = 78.6%

In [54]:
data.columns

Index(['QTY', 'UNIT_VALUE_USD', 'DRAWBACK', 'FOREIGN_COUNTRY', 'Categories',
       'FOREIGN PORT CONTINENT'],
      dtype='object')

In [55]:
import joblib 
joblib.dump(model_xgb,'XGB_Fractal_ExportData.pkl')
joblib.dump(X.columns.tolist(),'columns.pkl')

['columns.pkl']