In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt

In [2]:
batteryData = pd.read_csv(r'C:\Users\snehal\PycharmProjects\BatteryHealthCheckML\data\Train_data.csv')

In [3]:
batteryData.head()

Unnamed: 0,Voltage,Current,Temperature,Capacity,SOC Capacity,SOC Percentage
0,4.18616,-0.03831,-0.73611,0.0,2.59028,0.999495
1,4.1838,-0.07662,-0.73611,0.0,2.59028,0.999495
2,4.18228,-0.08939,-0.73611,-1e-05,2.59027,0.999491
3,4.1811,-0.0945,-0.73611,-1e-05,2.59027,0.999491
4,4.18026,-0.0945,-0.73611,-1e-05,2.59027,0.999491


In [4]:
batteryData = batteryData.round({'Voltage':4, 'Temperature':4, 'Capacity':4, 'SOC Capacity':4, 'SOC Percentage':4}) 

In [5]:
batteryData.shape

(1094734, 6)

In [6]:
batteryData.columns

Index(['Voltage', 'Current', 'Temperature', 'Capacity', 'SOC Capacity',
       'SOC Percentage'],
      dtype='object')

In [7]:
# Check columns info
batteryData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094734 entries, 0 to 1094733
Data columns (total 6 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   Voltage         1094734 non-null  float64
 1   Current         1094734 non-null  float64
 2   Temperature     1094734 non-null  float64
 3   Capacity        1094734 non-null  float64
 4   SOC Capacity    1094734 non-null  float64
 5   SOC Percentage  1094734 non-null  float64
dtypes: float64(6)
memory usage: 50.1 MB


In [8]:
# Check match df stats
batteryData.describe()

Unnamed: 0,Voltage,Current,Temperature,Capacity,SOC Capacity,SOC Percentage
count,1094734.0,1094734.0,1094734.0,1094734.0,1094734.0,1094734.0
mean,3.682971,-1.189149,9.331152,-1.223793,1.366487,0.527278
std,0.2797686,2.45386,8.90049,0.7572624,0.7572633,0.2921991
min,2.7927,-18.09828,-0.8413,-2.5903,0.0,0.0
25%,3.4784,-2.33444,0.3155,-1.8113,0.779,0.3006
50%,3.6808,-0.61554,9.4643,-1.2738,1.3165,0.508
75%,3.9126,0.0,10.5158,-0.5251,2.0652,0.7969
max,4.2399,6.00472,26.1845,0.0013,2.5916,1.0


In [9]:
# Check null entries
batteryData.isna().sum()

Voltage           0
Current           0
Temperature       0
Capacity          0
SOC Capacity      0
SOC Percentage    0
dtype: int64

In [10]:
# Check duplicate entries
batteryData.duplicated().sum()

210404

In [11]:
# Drop duplicate rows
batteryData = batteryData.drop_duplicates()

In [12]:
# Check duplicate entries
batteryData.duplicated().sum()

0

In [13]:
features = ['Voltage', 'Current', 'Temperature', 'Capacity']
targetVariable='SOC Percentage'

In [14]:
X = batteryData[features]
X

Unnamed: 0,Voltage,Current,Temperature,Capacity
0,4.1862,-0.03831,-0.7361,0.0000
1,4.1838,-0.07662,-0.7361,0.0000
2,4.1823,-0.08939,-0.7361,-0.0000
3,4.1811,-0.09450,-0.7361,-0.0000
4,4.1803,-0.09450,-0.7361,-0.0000
...,...,...,...,...
1094312,3.2278,0.00000,23.9762,-2.5583
1094396,3.2280,0.00000,23.9762,-2.5583
1094481,3.2282,0.00000,23.9762,-2.5583
1094569,3.2284,0.00000,23.9762,-2.5583


In [15]:
y = batteryData[targetVariable].values
y = y.astype('float')
y

array([0.9995, 0.9995, 0.9995, ..., 0.0124, 0.0124, 0.0124])

In [16]:
from sklearn.preprocessing import StandardScaler

predictorScaler = StandardScaler()

predictorScalerFit = predictorScaler.fit(X)

X_enc = predictorScalerFit.transform(X)
X_enc

array([[ 1.78294432,  0.52563602, -1.13025477,  1.60406085],
       [ 1.77414383,  0.51124957, -1.13025477,  1.60406085],
       [ 1.76864352,  0.50645409, -1.13025477,  1.60406085],
       ...,
       [-1.72991903,  0.54002247,  1.65230663, -1.99681285],
       [-1.72918566,  0.54002247,  1.65230663, -1.99681285],
       [-1.72881897,  0.54002247,  1.65230663, -1.99681285]])

In [17]:
# Splitting the data into trainand test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.2, random_state=42)
print('X_train.shape - ',X_train.shape)
print('X_test.shape - ',X_test.shape)
print('y_train.shape - ',y_train.shape)
print('y_test.shape - ',y_test.shape)

X_train.shape -  (707464, 4)
X_test.shape -  (176866, 4)
y_train.shape -  (707464,)
y_test.shape -  (176866,)


In [18]:
y_test.max()

1.0

In [29]:
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
import xgboost as xg 

In [31]:
regressor = [
    LinearRegression(),
    DecisionTreeRegressor(),
    AdaBoostRegressor(),
    RandomForestRegressor(),
    xg.XGBRegressor() 
]

name = []
train_acc = []
test_acc = []
models = []
rmse = []

i = 0
for regression in regressor:
    regression.fit(X_train, y_train)
    name.append(type(regression).__name__)
    train_acc.append(regression.score(X_train, y_train))
    test_acc.append(regression.score(X_test, y_test))
    models.append(regression)
    rmse.append(np.sqrt(mean_squared_error(regression.predict(X_test), y_test)))

In [32]:
df_score = pd.DataFrame(list(zip(name, train_acc, test_acc, rmse, models)), columns=['name', 'train_acc', 'test_acc', 'rmse', 'models'])
df_score.set_index('name', inplace=True)
df_score.sort_values(by=['test_acc'], inplace=True)
df_score

Unnamed: 0_level_0,train_acc,test_acc,rmse,models
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AdaBoostRegressor,0.989618,0.989592,0.027975,"(DecisionTreeRegressor(max_depth=3, random_sta..."
XGBRegressor,0.999984,0.999984,0.001091,"XGBRegressor(base_score=None, booster=None, ca..."
DecisionTreeRegressor,1.0,1.0,3.7e-05,DecisionTreeRegressor()
LinearRegression,1.0,1.0,3.1e-05,LinearRegression()
RandomForestRegressor,1.0,1.0,3e-05,"(DecisionTreeRegressor(max_features=1.0, rando..."


In [33]:
X_test

array([[-0.01895657,  0.04127343,  1.65230663, -0.34803693],
       [-0.3618091 ,  0.50357379, -0.94080824, -0.55438023],
       [ 0.92966321, -0.18795594,  0.01829228,  1.12085718],
       ...,
       [-0.13886328, -0.54475567,  0.01829228, -0.21896679],
       [-1.72625216, -0.41239357, -0.94080824, -1.65098372],
       [-0.77103201, -1.40509243, -0.85792467, -0.29441019]])

In [34]:
#Final model
model = df_score.loc["RandomForestRegressor", "models"]
prediction = model.predict(X_test)
prediction

array([0.46431367, 0.4078    , 0.867     , ..., 0.4997    , 0.107116  ,
       0.479     ])

In [35]:
prediction.max()

1.0

In [36]:
predictorScalerFit.inverse_transform(X_test)

array([[ 3.6948 , -1.32813, 23.9762 , -1.3869 ],
       [ 3.6013 , -0.09706,  0.9464 , -1.5335 ],
       [ 3.9535 , -1.93855,  9.4643 , -0.3433 ],
       ...,
       [ 3.6621 , -2.88868,  9.4643 , -1.2952 ],
       [ 3.2292 , -2.53621,  0.9464 , -2.3126 ],
       [ 3.4897 , -5.17969,  1.6825 , -1.3488 ]])

In [37]:
import pickle
pickle.dump(model,open('BatteryHealthPredictorModel.pkl','wb'))