In [1]:
## Key Imports
import pandas as pd 
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import os
from sklearn.ensemble import RandomForestClassifier

In [2]:
## Loading The Data
baseDataDir = r"/home/parth/Machine Learning/Datasets/North Corp/Ajanta/DMK"
baseTripDataDir = r"/home/parth/Machine Learning/Datasets/North Corp/Ajanta"
file_list = os.listdir(baseDataDir)
# Loading First File
file = file_list[0] 
file_path = os.path.join(baseDataDir , file)
data = pd.read_csv(file_path,delimiter=';')
tripData = pd.read_csv(os.path.join(baseTripDataDir , "trip" , "Trip_MC2M1PRC0GJ009702.csv"))

In [3]:
def getRange(d1 , d2):
    obj1 = datetime.strptime(d1 , "%Y-%m-%d %H:%M:%S")
    t1 = (obj1 - datetime(1970 , 1 , 1)).total_seconds()
    obj2 = datetime.strptime(d2 , "%d/%m/%Y %H:%M")
    t2 = (obj2 - datetime(1970 , 1 , 1)).total_seconds()
    return range(int(t1) , int(np.ceil(t2)) + 1)

In [4]:
df = data.sort_values("updateddate" , ascending=True)
df["timeObject"] = df.updateddate.apply(lambda x: datetime.strptime(x , "%Y-%m-%d %H:%M:%S"))

In [6]:
df["fuelEfficiency"] = df.total_fuel_consumption.diff() / df.total_distance.diff()
df["time"] = df.timeObject.apply(lambda x : x - datetime(2000 ,1 , 1)).apply(lambda x : x.total_seconds())
df = df.sort_values("time")
df["AvgSpeed"] = df.total_distance.diff() / df.timeObject.diff().apply(lambda x: x.total_seconds())
important_cols = [ 'fuelEfficiency' , 'current_fuel_level' , 'speed' , 'time_sweetspot' , "AvgSpeed" , "timeObject"]
dfImportant = df[important_cols]
dfImportant["idleDiff"] = df.time_idle.diff()
dfImportant["idleDiff"].iloc[0] = 0
not_considering_mask = np.logical_or(
    np.logical_or(
        df.fuelEfficiency.isna()  , 
        np.isinf(df.fuelEfficiency)
    ) 
    , 
    np.logical_or(
        np.isinf(dfImportant.AvgSpeed) ,
        np.isnan(dfImportant.AvgSpeed)
    )
)
dfImportant = dfImportant[np.logical_not(not_considering_mask)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
dfImportant.corr().to_csv("LinearCorrelationsWithFE.csv")

In [8]:
def makeBins(x):
    fuel_range = x.max() - x.min()
    binSize = fuel_range / 40
    def makeClasses(x):
        return x // binSize
    return makeClasses

In [9]:
binfunc = makeBins(dfImportant.fuelEfficiency)
dfImportant["feClass"] = binfunc(dfImportant.fuelEfficiency)
mask = dict( dfImportant.feClass.value_counts().sort_index() < 20 )
true_mask = np.array([mask[x] for x in dfImportant.feClass])
true_mask = np.logical_not(true_mask)
dfImportant = dfImportant[true_mask]

In [10]:
importances = []
scores = []
times = [ (a,b) for a , b in zip(tripData["Start_Date"] , tripData["End_Date"])]
searchs = [getRange(*time) for time in times]
temp = dfImportant.timeObject.apply(
                lambda x:  (x - datetime(1970 , 1 , 1)).total_seconds() 
            )
dfImportant.drop(columns="timeObject" , inplace=True)
rnd_clf = RandomForestClassifier(n_jobs=-1 , n_estimators=100 , max_depth=5 , oob_score=True)
for search in searchs:
    dfSearch = dfImportant[ temp.isin(search) ]
    X = dfImportant[['current_fuel_level' , 'speed' , 'time_sweetspot' , "AvgSpeed" , "idleDiff"]]
    y = dfImportant[["feClass"]]
    rnd_clf.fit(X , y.values.ravel())
    importances.append(rnd_clf.feature_importances_)
    scores.append(rnd_clf.oob_score_)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [12]:
cols = [ x + "_IMP" for x in ['current_fuel_level' , 'speed' , 'time_sweetspot' , "AvgSpeed" , "idleDiff"]]
importances_dat = pd.DataFrame(np.array(importances) , columns=cols)
scores_dat = pd.DataFrame(scores , columns=["Model Accuracy"])

In [13]:
tripFinal = tripData.join([importances_dat , scores_dat])

In [16]:
tripFinal.to_csv("TripWise FE analysis.csv")

In [15]:
tripFinal

Unnamed: 0.1,Unnamed: 0,Start_Date,End_Date,day_diff,Start_Km,End_Km,delta_total_distance,delta_Fuel_consumption,delta_Engine_Hours,driveTimeDelta,...,sweetspot_time_%,Daily_disance_cover,daily_running_hours,idleTimeDelta,current_fuel_level_IMP,speed_IMP,time_sweetspot_IMP,AvgSpeed_IMP,idleDiff_IMP,Model Accuracy
0,1779,2019-01-08 16:52:00,11/01/2019 8:47,2.663194,77232.41,77688.8,456.39,111.49,14.95,13.85,...,25.752508,171.369387,5.613559,1.11,0.041011,0.237965,0.016586,0.652267,0.052171,0.695644
1,2345,2019-01-11 16:37:00,15/01/2019 5:27,3.534722,77698.66,78056.43,357.77,94.81,12.25,11.5,...,22.44898,101.215874,3.465619,0.73,0.040404,0.244561,0.017229,0.648155,0.04965,0.694537
2,7295,2019-01-15 14:17:00,12/02/2019 6:37,27.680556,78065.58,80989.71,2924.13,751.08,98.55,90.65,...,25.114155,105.638414,3.560261,7.86,0.040852,0.196081,0.020326,0.696505,0.046236,0.694629
3,7954,2019-02-12 14:02:00,14/02/2019 5:37,1.649306,80999.0,81311.79,312.79,88.26,9.05,8.3,...,25.414365,189.649516,5.487158,0.75,0.041363,0.235568,0.020049,0.650344,0.052676,0.695367
4,8633,2019-02-14 13:22:00,20/02/2019 6:42,5.722222,81321.96,81754.9,432.94,106.55,13.45,12.85,...,24.535316,75.659417,2.350485,0.63,0.043397,0.199524,0.017141,0.687821,0.052116,0.695367
5,9226,2019-02-20 17:42:00,23/02/2019 5:57,2.510417,81762.81,82123.51,360.7,92.95,11.55,11.1,...,23.809524,143.681328,4.60083,0.44,0.039184,0.213404,0.01792,0.674821,0.05467,0.696752
6,10412,2019-02-23 16:27:00,28/02/2019 5:52,4.559028,82131.99,83113.66,981.67,245.77,29.25,27.85,...,27.179487,215.324417,6.415842,1.45,0.039492,0.230096,0.017664,0.65218,0.060567,0.695736
7,12016,2019-02-28 11:02:00,11/03/2019 7:02,10.833333,83122.36,84069.0,946.64,253.2,31.45,29.1,...,24.642289,87.382154,2.903077,2.35,0.043612,0.208794,0.01647,0.677629,0.053496,0.693891
8,12654,2019-03-11 14:27:00,14/03/2019 6:22,2.663194,84077.73,84447.53,369.8,99.38,12.35,11.4,...,23.076923,138.855802,4.637288,0.91,0.038546,0.221187,0.018481,0.670466,0.05132,0.695644
9,13902,2019-03-15 17:07:00,20/03/2019 6:12,4.545139,84559.41,85180.8,621.39,157.91,19.8,18.1,...,24.242424,136.715294,4.356303,1.69,0.041991,0.258574,0.019028,0.62903,0.051378,0.697121
