In [2]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

import pandas as pd
pd.options.display.max_columns = 100

from matplotlib import pyplot as plt
import numpy as np

import seaborn as sns
sns.set(rc={'figure.figsize':(12,9)})
import pylab as plot
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss,accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

import statsmodels.formula.api as sm

In [3]:
from xgboost import XGBClassifier
import xgboost as xgb

In [4]:
import category_encoders as ce


In [5]:
#for scaling
from sklearn.preprocessing import StandardScaler

Importing training dataset

In [6]:
data = pd.read_csv('./train.csv')

In [7]:
print(data.shape)

(867873, 10)


In [8]:
data.columns

Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y', 'Id'],
      dtype='object')

Splitting the "Dates" into date and time

In [9]:
df=data.head(100000)
datetime=df.Dates.str.split(pat=" ",expand=True)
datetime.columns=['Date','Time']
#datetime

In [10]:
Date=datetime.Date.str.split(pat="-",expand=True)
Date.columns=['Year','Month','Day']

Time=datetime.Time.str.split(pat=":",expand=True)
Time.columns=['Hour','Minute','Second']

Dropping the column of "Dates" and appending the dataframe "datetime"

In [11]:
df=pd.concat([df,Date,Time],axis=1)
#df

In [12]:
df=df.drop(labels=['Dates'],axis=1)

# #Encoding

In [13]:
df.columns

Index(['Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution',
       'Address', 'X', 'Y', 'Id', 'Year', 'Month', 'Day', 'Hour', 'Minute',
       'Second'],
      dtype='object')

LabelEncoding of "DayOfWeek" and "PdDistrict"

In [14]:
le = preprocessing.LabelEncoder()

In [15]:
#y=pd.get_dummies(df.Category,columns=['Category'],prefix=" ",prefix_sep=" ",drop_first=True,)

le_res=le.fit_transform(df['Category'])
y=pd.DataFrame(le_res)
y.columns=['Category']
#y

# Rotation 45 degree

In [16]:

df["X"]=0.0174533 * df["X"] 
df["Y"]=0.0174533* df["Y"] 

In [17]:
df['Y']=df['Y'].apply(lambda x : x if 37.82 > x else 37.82)
df['X']=df['X'].apply(lambda x : x if -122.3 > x else 122.3)

In [18]:
df["rot45_X"]=0.707 * df["Y"] + 0.707 * df["X"]
df["rot45_Y"]=0.707 * df["Y"] - 0.707 * df["X"]

In [19]:
df["radial45"]=np.sqrt(np.power(df['rot45_X'],2) + np.power(df['rot45_Y'],2))

# Rotation 30 degree

In [20]:
df["rot30_X"]=(1.732/2) * df["Y"] + 0.5 * df["X"]
df["rot30_Y"]=(1.732/2) * df["Y"] - 0.5 * df["X"]


In [21]:
df["radial30"]=np.sqrt(np.power(df['rot30_X'],2) + np.power(df['rot30_Y'],2))

# Rotation 60 degress

In [22]:
df["rot60_X"]=(0.5) * df["Y"] + (1.732/2) * df["X"]
df["rot60_Y"]=0.5 * df["Y"] - (1.732/2) * df["X"]


In [23]:
df["radial60"]=np.sqrt(np.power(df['rot60_X'],2) + np.power(df['rot60_Y'],2))

# Hours clubbing

In [24]:
df['Hour']=df['Hour'].apply(lambda x:int(x))
#df['Hour']=df['Hour'].apply(lambda x : 'EARLY_Morning' if (x >= 1) and (x <=5) else ('MORNING' if x >= 6 and x <=11 else ('DAY' if x>=12 and x<=18 else 'Night')))
df['Hour']=df['Hour'].apply(lambda x : 'EARLY_MORNING' if x in (1,2,3,4,5) else ('MORNING' if x in (6,7,8,9,10,11) else ('DAY' if x in (12,13,14,15,16,17,18) else 'NIGHT')))

# Month Clubbing

In [25]:
df['Month']=df['Month'].apply(lambda x:int(x))
df['Month']=df['Month'].apply(lambda x : 'MonthLow' if x== 12 else ('MonthMed' if x in (2,6,7,8,9,11) else 'MonthHigh'))

# Minute Clubbing 

In [26]:
df['Minute']=df['Minute'].apply(lambda x:int(x))
df['Minute']=df['Minute'].apply(lambda x : 'low' if x <31 else 'high')

In [27]:
df.head()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y,Id,Year,Month,Day,Hour,Minute,Second,rot45_X,rot45_Y,radial45,rot30_X,rot30_Y,radial30,rot60_X,rot60_Y,radial60
0,SEX OFFENSES FORCIBLE,"FORCIBLE RAPE, BODILY FORCE",Friday,MISSION,NONE,2100 Block of MISSION ST,122.3,0.659076,141546,2013,MonthMed,28,DAY,high,0,86.932067,-86.000133,122.283307,61.72076,-60.57924,86.482926,106.241338,-105.582262,149.782629
1,LIQUOR LAWS,CONSUMING ALCOHOL IN PUBLIC VIEW,Thursday,SOUTHERN,"ARREST, BOOKED",1000 Block of MARKET ST,122.3,0.659408,794152,2004,MonthMed,19,EARLY_MORNING,high,0,86.932301,-85.999899,122.283309,61.721047,-60.578953,86.48293,106.241504,-105.582096,149.78263
2,FRAUD,"CREDIT CARD, THEFT BY USE OF",Wednesday,SOUTHERN,NONE,800 Block of BRYANT ST,122.3,0.659306,531205,2007,MonthMed,14,NIGHT,low,0,86.932229,-85.999971,122.283308,61.720959,-60.579041,86.482929,106.241453,-105.582147,149.78263
3,ROBBERY,ROBBERY OF A CHAIN STORE WITH A GUN,Thursday,BAYVIEW,DISTRICT ATTORNEY REFUSES TO PROSECUTE,2400 Block of SAN BRUNO AV,122.3,0.658516,523137,2007,MonthLow,27,DAY,low,0,86.931671,-86.000529,122.283304,61.720275,-60.579725,86.48292,106.241058,-105.582542,149.782628
4,OTHER OFFENSES,PROBATION VIOLATION,Sunday,SOUTHERN,"ARREST, BOOKED",4TH ST / STEVENSON ST,122.3,0.659478,200968,2012,MonthMed,9,DAY,low,0,86.932351,-85.999849,122.283309,61.721108,-60.578892,86.482931,106.241539,-105.582061,149.78263


# Address splitting

In [28]:
address=pd.DataFrame(df['Address'],columns=['Address'])
address=address.Address.str.split(pat=" /",expand=True )

address.columns=['Address','Intr2']

address=address.Address.str.split(pat=" /",expand=True )
address.columns=['Address']

In [29]:
address.head()

Unnamed: 0,Address
0,2100 Block of MISSION ST
1,1000 Block of MARKET ST
2,800 Block of BRYANT ST
3,2400 Block of SAN BRUNO AV
4,4TH ST


In [30]:
string=address.iloc[:,0]
string=string.str.strip()

In [31]:
address_fram=string.to_frame()
temp=address_fram['Address'].astype(str).str[-2:]

In [32]:
temp.unique()

array(['ST', 'AV', 'LN', 'DR', 'BL', 'HY', 'CT', 'RD', 'PL', 'PZ', '80',
       'TR', 'WY', 'AL', 'AY', 'ER', 'MS', 'CR', 'TI', 'WK', 'AR', 'EX'],
      dtype=object)

In [33]:
address=temp.to_frame()

("ST","AV","LN","DR","BL","HY","CT","RD","PL","PZ","TR","AL","CR","WK","EX","RW")

In [34]:
address['Address']=address['Address'].apply(lambda x :( x if x in ("ST","AV","LN","DR","BL","HY","CT","RD","PL","PZ","TR","AL","CR","WK","EX","RW") else (("I-80" if x in ("80") else ("HWY" if x in ("WY") else ("WAY" if x in ("AY") else ("TER" if x in ("ER") else ("ALMS" if x in ("MS") else ("MAR" if x in ("AR") else ("PARK" if x in ("RK") else ("STWY" if x in ("WY") else ("VIA" if x in ("NO") else ("BLOCK")))))))))))))

In [35]:
#address.to_csv("Address.csv", float_format = '%.5F')

In [36]:
df=df.drop(labels=['Address'],axis=1)

Concatenating "df" and "address"

In [37]:
df=pd.concat([address,df],axis=1)

In [38]:
Id=df['Id']
df=df.drop(['Y','Descript','Category','Resolution','Id','Second','rot45_X','rot45_Y','radial45','rot30_X','rot30_Y','radial30','rot60_X','rot60_Y','radial60'],axis=1)

In [39]:
df.head()

Unnamed: 0,Address,DayOfWeek,PdDistrict,X,Year,Month,Day,Hour,Minute
0,ST,Friday,MISSION,122.3,2013,MonthMed,28,DAY,high
1,ST,Thursday,SOUTHERN,122.3,2004,MonthMed,19,EARLY_MORNING,high
2,ST,Wednesday,SOUTHERN,122.3,2007,MonthMed,14,NIGHT,low
3,AV,Thursday,BAYVIEW,122.3,2007,MonthLow,27,DAY,low
4,ST,Sunday,SOUTHERN,122.3,2012,MonthMed,9,DAY,low


Label Encoded Columns :PdDistrict ,Address ,X ,Y

In [40]:
data['PdDistrict'].unique()

array(['MISSION', 'SOUTHERN', 'BAYVIEW', 'CENTRAL', 'INGLESIDE',
       'NORTHERN', 'RICHMOND', 'TARAVAL', 'TENDERLOIN', 'PARK'],
      dtype=object)

# Label Encoding

In [41]:
le = preprocessing.LabelEncoder()

"DayOf Week"

In [42]:
"""
le_res=le.fit_transform(df['DayOfWeek'])
Day=pd.DataFrame(le_res)
Day.columns=['DayOfWeek']
df=df.drop(labels=['DayOfWeek'],axis=1)
df=pd.concat([Day,df],axis=1)

"""

"\nle_res=le.fit_transform(df['DayOfWeek'])\nDay=pd.DataFrame(le_res)\nDay.columns=['DayOfWeek']\ndf=df.drop(labels=['DayOfWeek'],axis=1)\ndf=pd.concat([Day,df],axis=1)\n\n"

"PdDistrict"

In [43]:
"""
#le = preprocessing.LabelEncoder()
le_res=le.fit_transform(df['PdDistrict'])
District=pd.DataFrame(le_res)

#District=pd.get_dummies(df['PdDistrict'],drop_first=True)
District.columns=['District']
df=df.drop(labels=['PdDistrict'],axis=1)
df=pd.concat([District,df],axis=1)

"""

"\n#le = preprocessing.LabelEncoder()\nle_res=le.fit_transform(df['PdDistrict'])\nDistrict=pd.DataFrame(le_res)\n\n#District=pd.get_dummies(df['PdDistrict'],drop_first=True)\nDistrict.columns=['District']\ndf=df.drop(labels=['PdDistrict'],axis=1)\ndf=pd.concat([District,df],axis=1)\n\n"

"Address"

In [44]:
le_res=le.fit_transform(df['Address'])
Address=pd.DataFrame(le_res)
Address.columns=['Address']
df=df.drop(labels=['Address'],axis=1)
df=pd.concat([Address,df],axis=1)


"Year"

In [45]:
#le = preprocessing.LabelEncoder()
le_res=le.fit_transform(df['Year'])
Year=pd.DataFrame(le_res)

#District=pd.get_dummies(df['PdDistrict'],drop_first=True)
Year.columns=['Year']
df=df.drop(labels=['Year'],axis=1)
df=pd.concat([Year,df],axis=1)


"Month"

In [46]:

#le = preprocessing.LabelEncoder()
le_res=le.fit_transform(df['Month'])
Month=pd.DataFrame(le_res)

#District=pd.get_dummies(df['PdDistrict'],drop_first=True)
Month.columns=['Month']
df=df.drop(labels=['Month'],axis=1)
df=pd.concat([Month,df],axis=1)



"Day"

In [47]:

#le = preprocessing.LabelEncoder()
le_res=le.fit_transform(df['Day'])
Day=pd.DataFrame(le_res)

#District=pd.get_dummies(df['PdDistrict'],drop_first=True)
Day.columns=['Day']
df=df.drop(labels=['Day'],axis=1)
df=pd.concat([Day,df],axis=1)



"Hour"

In [48]:
"""
#le = preprocessing.LabelEncoder()
le_res=le.fit_transform(df['Hour'])
Hour=pd.DataFrame(le_res)

#District=pd.get_dummies(df['PdDistrict'],drop_first=True)
Hour.columns=['Hour']
df=df.drop(labels=['Hour'],axis=1)
df=pd.concat([Hour,df],axis=1)

"""

"\n#le = preprocessing.LabelEncoder()\nle_res=le.fit_transform(df['Hour'])\nHour=pd.DataFrame(le_res)\n\n#District=pd.get_dummies(df['PdDistrict'],drop_first=True)\nHour.columns=['Hour']\ndf=df.drop(labels=['Hour'],axis=1)\ndf=pd.concat([Hour,df],axis=1)\n\n"

"Minute"

In [49]:
"""
#le = preprocessing.LabelEncoder()
le_res=le.fit_transform(df['Minute'])
Minute=pd.DataFrame(le_res)

#District=pd.get_dummies(df['PdDistrict'],drop_first=True)
Minute.columns=['Minute']
df=df.drop(labels=['Minute'],axis=1)
df=pd.concat([Minute,df],axis=1)
"""

"\n#le = preprocessing.LabelEncoder()\nle_res=le.fit_transform(df['Minute'])\nMinute=pd.DataFrame(le_res)\n\n#District=pd.get_dummies(df['PdDistrict'],drop_first=True)\nMinute.columns=['Minute']\ndf=df.drop(labels=['Minute'],axis=1)\ndf=pd.concat([Minute,df],axis=1)\n"

# One Hot Encoding

In [54]:
#'Year','Month','Day',
#'PdDistrict','DayOfWeek'
df=pd.get_dummies(df,columns=['DayOfWeek','PdDistrict','Hour','Minute'],drop_first=True)

In [55]:
df.shape

(100000, 24)

# Feature Scaling(Not used YET)

In [56]:

scaler=StandardScaler()
df['X']=scaler.fit_transform(df['X'].as_matrix)


TypeError: float() argument must be a string or a number, not 'method'

36 Crime Category are there 

# Applying different Models

In [50]:
#Independent Column
X=df
X.shape

(100000, 9)

In [51]:
#Dependent column
y.shape

(100000, 1)

In [52]:
#splitting the dataset into the training set and test set
#from sklearn import model_selection as ms

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.33,random_state=0)

In [53]:
"""from sklearn.preprocessing import StandardScaler
sc_X=StandardScaler()
X_train=sc_X.fit_tranform(X_train)
X_test=sc_X.transform(X_test)"""

'from sklearn.preprocessing import StandardScaler\nsc_X=StandardScaler()\nX_train=sc_X.fit_tranform(X_train)\nX_test=sc_X.transform(X_test)'

In [57]:
X_dummy=X_train['Year']
regressor_OLS=sm.OLS(endog=y_train,exog=X_dummy).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,Category,R-squared:,0.552
Model:,OLS,Adj. R-squared:,0.552
Method:,Least Squares,F-statistic:,82420.0
Date:,"Tue, 13 Nov 2018",Prob (F-statistic):,0.0
Time:,22:33:25,Log-Likelihood:,-272390.0
No. Observations:,67000,AIC:,544800.0
Df Residuals:,66999,BIC:,544800.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Year,2.3119,0.008,287.094,0.000,2.296,2.328

0,1,2,3
Omnibus:,1078.851,Durbin-Watson:,1.699
Prob(Omnibus):,0.0,Jarque-Bera (JB):,621.516
Skew:,0.034,Prob(JB):,1.1e-135
Kurtosis:,2.533,Cond. No.,1.0


# XGBoost Model

In [62]:
X.head()

Unnamed: 0,Day,Month,Year,Address,X,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,Hour_EARLY_MORNING,Hour_MORNING,Hour_NIGHT,Minute_low
0,27,2,10,17,122.3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,18,2,1,17,122.3,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0
2,13,2,4,17,122.3,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1
3,26,1,4,2,122.3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,8,2,9,17,122.3,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [1]:
max_depth_options=[7]
rate=[0.001,0.1,0.02]
for depth in max_depth_options :
    model = xgb.XGBClassifier(objective='multi:softprob',learning_rate=0.3)
    model.fit(X_train, y_train)
    y_pred=model.predict_proba(X_test)    
    print (log_loss(y_test,y_pred));

NameError: name 'xgb' is not defined

In [None]:
model = XGBClassifier(objective='multi:softprob')
learning_rate = [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate=learning_rate)
kfold = StratifiedKFold(n_splits=10)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train,y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))



In [None]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
	print("%f (%f) with: %r" % (mean, stdev, param))

# Logistic Regression

In [320]:
from sklearn.linear_model import LogisticRegression
#weight={Address:3,District:3,X:1,Day:2}
#weight={LARCENY/THEFT:35}
classifier = LogisticRegression(penalty='l2',random_state = 0,class_weight='balanced',multi_class='multinomial', solver='lbfgs',n_jobs=-1,max_iter=100,tol=1e-3)
classifier.fit(X_train,y_train)
# Applying k-Fold Cross Validation
"""from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X=df , y=y , cv = 10,n_jobs=-1)
print("Logistic Regression:\n Accuracy:", accuracies.mean(), "+/-", accuracies.std(),"\n")
"""

'from sklearn.model_selection import cross_val_score\naccuracies = cross_val_score(estimator = classifier, X=df , y=y , cv = 10,n_jobs=-1)\nprint("Logistic Regression:\n Accuracy:", accuracies.mean(), "+/-", accuracies.std(),"\n")\n'

In [321]:
#predicting the Test set result
y_pred=classifier.predict_proba(X_test)

In [322]:
ll = log_loss(y_test,y_pred)
ll

3.5024857406249956

#  Random Forest 

In [108]:
from sklearn.ensemble import RandomForestClassifier

In [117]:
#sample_leaf_options = [100]
max_depth_options=[20]
for depth in max_depth_options :
    
    clf = RandomForestClassifier(max_depth=depth,n_estimators =200,criterion = 'entropy',random_state = 0, min_samples_leaf=100,oob_score = True,max_features = "auto")
    clf.fit(X_train,y_train)
    y_pred=clf.predict_proba(X_test)
    print (log_loss(y_test,y_pred));

  


2.4510212784930783


In [2]:
y.shape

NameError: name 'y' is not defined