In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
import datetime as dt
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn import metrics

In [2]:
df = pd.read_csv("data/new/New7.csv", sep=';')
df = df.dropna()

df["Customer Country"] = pd.Categorical(df["Customer Country"])
df["Shipper Country"] = pd.Categorical(df["Shipper Country"])

df["Customer Country Codes"] = df["Customer Country"].cat.codes
df["Shipper Country Codes"] = df["Shipper Country"].cat.codes

df["Ship Date"] = pd.to_datetime(df["Ship Date"], format='%d-%m-%Y %H:%M')
df["Actual Del Date"] = pd.to_datetime(df["Actual Del Date"], format='%d-%m-%Y %H:%M', errors="coerce")

df["test"] = df["Actual Del Date"] - df["Ship Date"]
df = df.drop_duplicates()
df["Shipment Time"] = (df["test"]).dt.days
del df["test"]

In [3]:
indexNames = df[ df['Shipment Time'] < 1].index
df.drop(indexNames , inplace=True)

indexNames = df[ df['Shipper Country Codes'] < 1].index
df.drop(indexNames , inplace=True)

df = df.dropna()
df["Shipment Time"] = df["Shipment Time"].astype("int32")

indexNames = df[ df['Shipment Time'] > 365 ].index
df.drop(indexNames , inplace=True)

df = df.reset_index(drop=True)
df.to_csv(r"test.csv", index=False)

In [16]:
print(len(df["Customer Country Codes"].unique()))

71


In [8]:
x = df["Customer Country Codes"].to_numpy()
y = df["Shipment Time"].to_numpy()
xRes = x.reshape((-1,1))
yRes = y.reshape((-1,1))

x_train, x_test, y_train, y_test = train_test_split(xRes, yRes, test_size=0.3, random_state=101)

scaler = StandardScaler()
scaler.fit(x_train)

X_trainScaled = scaler.transform(x_train)
X_testScaled = scaler.transform(x_test)

accScores = pd.DataFrame(columns = ['model', 'score'])
model = LogisticRegression(multi_class='multinomial')
model.fit(X_trainScaled, y_train.ravel())
y_pred = model.predict(X_testScaled)
score = metrics.accuracy_score(y_test, y_pred)
pd.Series(["Logistic Regression", score], index=accScores.columns)

model    Logistic Regression
score                0.41573
dtype: object

In [9]:
pred = model.predict(np.array([[54]]))
print(pred)

[43]


In [6]:
# import numpy as np
# import matplotlib.pyplot as plt 
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.dummy import DummyClassifier

# from sklearn import metrics
# from sklearn.metrics import confusion_matrix

# from sklearn.naive_bayes import CategoricalNB
# from sklearn.naive_bayes import GaussianNB
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.naive_bayes import ComplementNB

# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import classification_report

# from sklearn.svm import SVC
# from sklearn.preprocessing import StandardScaler
# from mpl_toolkits import mplot3d
# from ipywidgets import interact, fixed
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.ensemble import RandomForestClassifier

# import six
# import sys
# sys.modules['sklearn.externals.six'] = six
# from id3 import Id3Estimator
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.experimental import enable_hist_gradient_boosting
# from sklearn.ensemble import HistGradientBoostingClassifier
# from xgboost import XGBClassifier
# from sklearn.ensemble import StackingClassifier
# from sklearn.pipeline import make_pipeline
# from sklearn.svm import LinearSVC
# from sklearn.model_selection import GridSearchCV
# from sklearn import tree

# from sklearn import metrics
# xRes = x.reshape((-1,1))
# yRes = y.reshape((-1,1))

# x_train, x_test, y_train, y_test = train_test_split(xRes, yRes, test_size=0.4, random_state=101)

# # set the scaler
# scaler = StandardScaler()
# scaler.fit(x_train)

# # Convert the train and test X values, using the same scaler (so based on the X_train)
# X_trainScaled = scaler.transform(x_train)
# X_testScaled = scaler.transform(x_test)

# accScores = pd.DataFrame(columns = ['model', 'score'])

# # Create a list with all the different models (except polynomial svm).
# models =[["Dummy - Uniform", DummyClassifier(strategy="uniform")]]
# models.append(['LDA', LinearDiscriminantAnalysis()])
# # models.append(['QDA', QuadraticDiscriminantAnalysis()])
# models.append(['Logistic Regression', LogisticRegression()])
# models.append(['Multinomial LR', LogisticRegression(multi_class='multinomial')])
# models.append(['Boosting - AdaBoost', AdaBoostClassifier(random_state=0)])
# models.append(['Boosting - Gradient', GradientBoostingClassifier(random_state=0)])
# # models.append(['Boosting - Histogram Gradient', HistGradientBoostingClassifier(random_state=0)])
# # models.append(['Boosting - XGboost', XGBClassifier()])
# estimators = [
#     ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
#     ('svr', make_pipeline(StandardScaler(),
#                           LinearSVC(random_state=42)))]
# models.append(['Stacking', StackingClassifier(estimators=estimators, final_estimator = LogisticRegression())])

# # Now to iterate over all of them:
# for i in models:
#     print(i[1])
#     model = i[1]
#     model.fit(X_trainScaled, y_train)
#     y_pred = model.predict(X_testScaled)
#     score=metrics.accuracy_score(y_test, y_pred)
#     newRow=pd.Series([i[0], score], index=accScores.columns)
#     accScores=accScores.append(newRow, ignore_index=True)
    
# # And show all results sorted by their score:
# accScores.sort_values(by=['score'])