In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import os, glob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Importing the dataset
filename = "C:/Users/User/Desktop/smbp_mass.csv"
# parse_dates are used to add date and H. in one column 
mass_df = pd.read_csv(filename, parse_dates=[['Date', 'H.']])
mass_df= mass_df[['Date_H.','Brut']]

In [3]:
#Renaming the column from Date_H. , Brut to time, mass
mass_df.columns = ['time', 'mass']
#considering on those elements whose mass >0 
mass_df=mass_df[mass_df['mass']>0]
mass_df

Unnamed: 0,time,mass
0,01-10-2019 06:08:00,43.70
1,01-10-2019 06:12:00,43.80
2,01-10-2019 06:17:00,43.25
3,01-10-2019 06:23:00,44.40
4,01-10-2019 06:28:00,45.00
...,...,...
1451,30-09-2019 16:16:00,43.45
1452,30-09-2019 16:20:00,44.30
1453,30-09-2019 16:24:00,39.35
1454,30-09-2019 16:26:00,41.40


In [4]:
# Converting time to datetime format 
mass_df['time'] = pd.to_datetime(mass_df.time)
mass_df

Unnamed: 0,time,mass
0,2019-01-10 06:08:00,43.70
1,2019-01-10 06:12:00,43.80
2,2019-01-10 06:17:00,43.25
3,2019-01-10 06:23:00,44.40
4,2019-01-10 06:28:00,45.00
...,...,...
1451,2019-09-30 16:16:00,43.45
1452,2019-09-30 16:20:00,44.30
1453,2019-09-30 16:24:00,39.35
1454,2019-09-30 16:26:00,41.40


In [5]:
# converting datetime to numeric and diving by 1000000000 to get timestamp
mass_df['timestamp'] = pd.to_numeric(mass_df['time'])
mass_df['timestamp'] = mass_df['timestamp'].apply(lambda x : x/1000000000)
mass_df

Unnamed: 0,time,mass,timestamp
0,2019-01-10 06:08:00,43.70,1.547100e+09
1,2019-01-10 06:12:00,43.80,1.547101e+09
2,2019-01-10 06:17:00,43.25,1.547101e+09
3,2019-01-10 06:23:00,44.40,1.547101e+09
4,2019-01-10 06:28:00,45.00,1.547102e+09
...,...,...,...
1451,2019-09-30 16:16:00,43.45,1.569860e+09
1452,2019-09-30 16:20:00,44.30,1.569860e+09
1453,2019-09-30 16:24:00,39.35,1.569861e+09
1454,2019-09-30 16:26:00,41.40,1.569861e+09


In [6]:
#converting scientific to integer
mass_df['timestamp'] = mass_df['timestamp'].astype('int64')
mass_df['mass'].dtype

dtype('float64')

In [7]:
#convert mass from T to Kg
mass_df['mass'] = mass_df['mass'].apply(lambda x : x*1000)
mass_df

Unnamed: 0,time,mass,timestamp
0,2019-01-10 06:08:00,43700.0,1547100480
1,2019-01-10 06:12:00,43800.0,1547100720
2,2019-01-10 06:17:00,43250.0,1547101020
3,2019-01-10 06:23:00,44400.0,1547101380
4,2019-01-10 06:28:00,45000.0,1547101680
...,...,...,...
1451,2019-09-30 16:16:00,43450.0,1569860160
1452,2019-09-30 16:20:00,44300.0,1569860400
1453,2019-09-30 16:24:00,39350.0,1569860640
1454,2019-09-30 16:26:00,41400.0,1569860760


In [8]:
#load csv files in the dataframe and append all signal files in signal_df
path = "C:/Users/User/Desktop/Signal"

all_files = glob.glob(os.path.join(path, "*.csv"))

all_df = []
for f in all_files:
    df = pd.read_csv(f, sep=',')
    df['file'] = f.split('/')[-1]
    all_df.append(df)
    
signal_df = pd.concat(all_df, ignore_index=True, sort=True)

In [9]:
#drop the unwanted columns
signal_df = signal_df.drop(['file', 'channel_65','channel_66','channel_67'], axis=1)

In [10]:
signal_df

Unnamed: 0.1,Unnamed: 0,channel_1,channel_10,channel_11,channel_12,channel_13,channel_14,channel_15,channel_16,channel_17,...,channel_59,channel_6,channel_60,channel_61,channel_62,channel_63,channel_64,channel_7,channel_8,channel_9
0,2019-09-26 10:16:41.020,0.000,0.0,0.000000e+00,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000
1,2019-09-26 10:16:41.021,-0.005,0.0,2.000000e-03,-0.004,-0.010,-0.003,-0.010,-0.010,-0.009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057,0.0,0.003
2,2019-09-26 10:16:41.022,-0.009,0.0,1.000000e-03,-0.008,-0.008,0.009,0.001,-0.010,-0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079,0.0,0.004
3,2019-09-26 10:16:41.023,-0.006,0.0,-4.000000e-03,-0.012,-0.008,-0.001,-0.012,-0.013,-0.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103,0.0,0.004
4,2019-09-26 10:16:41.024,-0.004,0.0,-7.000000e-03,-0.013,-0.026,-0.008,-0.019,-0.014,-0.003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.107,0.0,-0.004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140835,2019-09-30 14:57:22.995,0.052,0.0,-4.000000e-03,-0.274,-0.449,-0.473,-0.325,-0.238,-0.155,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.245,0.0,0.009
2140836,2019-09-30 14:57:22.996,0.054,0.0,-1.000000e-03,-0.281,-0.423,-0.436,-0.306,-0.239,-0.153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.220,0.0,0.006
2140837,2019-09-30 14:57:22.997,0.051,0.0,8.000000e-03,-0.276,-0.400,-0.418,-0.300,-0.255,-0.163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189,0.0,0.001
2140838,2019-09-30 14:57:22.998,0.056,0.0,-1.136868e-13,-0.278,-0.374,-0.386,-0.285,-0.253,-0.163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158,0.0,0.001


In [11]:
# rename Unnamed:0 to time
signal_df = signal_df.rename(columns={'Unnamed: 0': 'time'})

In [12]:
signal_df

Unnamed: 0,time,channel_1,channel_10,channel_11,channel_12,channel_13,channel_14,channel_15,channel_16,channel_17,...,channel_59,channel_6,channel_60,channel_61,channel_62,channel_63,channel_64,channel_7,channel_8,channel_9
0,2019-09-26 10:16:41.020,0.000,0.0,0.000000e+00,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000
1,2019-09-26 10:16:41.021,-0.005,0.0,2.000000e-03,-0.004,-0.010,-0.003,-0.010,-0.010,-0.009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057,0.0,0.003
2,2019-09-26 10:16:41.022,-0.009,0.0,1.000000e-03,-0.008,-0.008,0.009,0.001,-0.010,-0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079,0.0,0.004
3,2019-09-26 10:16:41.023,-0.006,0.0,-4.000000e-03,-0.012,-0.008,-0.001,-0.012,-0.013,-0.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103,0.0,0.004
4,2019-09-26 10:16:41.024,-0.004,0.0,-7.000000e-03,-0.013,-0.026,-0.008,-0.019,-0.014,-0.003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.107,0.0,-0.004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140835,2019-09-30 14:57:22.995,0.052,0.0,-4.000000e-03,-0.274,-0.449,-0.473,-0.325,-0.238,-0.155,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.245,0.0,0.009
2140836,2019-09-30 14:57:22.996,0.054,0.0,-1.000000e-03,-0.281,-0.423,-0.436,-0.306,-0.239,-0.153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.220,0.0,0.006
2140837,2019-09-30 14:57:22.997,0.051,0.0,8.000000e-03,-0.276,-0.400,-0.418,-0.300,-0.255,-0.163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.189,0.0,0.001
2140838,2019-09-30 14:57:22.998,0.056,0.0,-1.136868e-13,-0.278,-0.374,-0.386,-0.285,-0.253,-0.163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158,0.0,0.001


In [13]:
#convert to datetime format 
signal_df['time'] = pd.to_datetime(signal_df.time)

In [14]:
# converting datetime to numeric and diving by 1000000000 to get timestamp
signal_df['timestamp'] = pd.to_numeric(signal_df['time'])
signal_df['timestamp'] = signal_df['timestamp'].apply(lambda x : x/1000000000)
signal_df

Unnamed: 0,time,channel_1,channel_10,channel_11,channel_12,channel_13,channel_14,channel_15,channel_16,channel_17,...,channel_6,channel_60,channel_61,channel_62,channel_63,channel_64,channel_7,channel_8,channel_9,timestamp
0,2019-09-26 10:16:41.020,0.000,0.0,0.000000e+00,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000,1.569493e+09
1,2019-09-26 10:16:41.021,-0.005,0.0,2.000000e-03,-0.004,-0.010,-0.003,-0.010,-0.010,-0.009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.057,0.0,0.003,1.569493e+09
2,2019-09-26 10:16:41.022,-0.009,0.0,1.000000e-03,-0.008,-0.008,0.009,0.001,-0.010,-0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.079,0.0,0.004,1.569493e+09
3,2019-09-26 10:16:41.023,-0.006,0.0,-4.000000e-03,-0.012,-0.008,-0.001,-0.012,-0.013,-0.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.103,0.0,0.004,1.569493e+09
4,2019-09-26 10:16:41.024,-0.004,0.0,-7.000000e-03,-0.013,-0.026,-0.008,-0.019,-0.014,-0.003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.107,0.0,-0.004,1.569493e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140835,2019-09-30 14:57:22.995,0.052,0.0,-4.000000e-03,-0.274,-0.449,-0.473,-0.325,-0.238,-0.155,...,0.0,0.0,0.0,0.0,0.0,0.0,0.245,0.0,0.009,1.569855e+09
2140836,2019-09-30 14:57:22.996,0.054,0.0,-1.000000e-03,-0.281,-0.423,-0.436,-0.306,-0.239,-0.153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.220,0.0,0.006,1.569855e+09
2140837,2019-09-30 14:57:22.997,0.051,0.0,8.000000e-03,-0.276,-0.400,-0.418,-0.300,-0.255,-0.163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.189,0.0,0.001,1.569855e+09
2140838,2019-09-30 14:57:22.998,0.056,0.0,-1.136868e-13,-0.278,-0.374,-0.386,-0.285,-0.253,-0.163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.158,0.0,0.001,1.569855e+09


In [15]:
#convert mass from T to Kg
signal_df['timestamp'] = signal_df['timestamp'].astype('int64')
signal_df

Unnamed: 0,time,channel_1,channel_10,channel_11,channel_12,channel_13,channel_14,channel_15,channel_16,channel_17,...,channel_6,channel_60,channel_61,channel_62,channel_63,channel_64,channel_7,channel_8,channel_9,timestamp
0,2019-09-26 10:16:41.020,0.000,0.0,0.000000e+00,0.000,0.000,0.000,0.000,0.000,0.000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.000,1569493001
1,2019-09-26 10:16:41.021,-0.005,0.0,2.000000e-03,-0.004,-0.010,-0.003,-0.010,-0.010,-0.009,...,0.0,0.0,0.0,0.0,0.0,0.0,0.057,0.0,0.003,1569493001
2,2019-09-26 10:16:41.022,-0.009,0.0,1.000000e-03,-0.008,-0.008,0.009,0.001,-0.010,-0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.079,0.0,0.004,1569493001
3,2019-09-26 10:16:41.023,-0.006,0.0,-4.000000e-03,-0.012,-0.008,-0.001,-0.012,-0.013,-0.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.103,0.0,0.004,1569493001
4,2019-09-26 10:16:41.024,-0.004,0.0,-7.000000e-03,-0.013,-0.026,-0.008,-0.019,-0.014,-0.003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.107,0.0,-0.004,1569493001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140835,2019-09-30 14:57:22.995,0.052,0.0,-4.000000e-03,-0.274,-0.449,-0.473,-0.325,-0.238,-0.155,...,0.0,0.0,0.0,0.0,0.0,0.0,0.245,0.0,0.009,1569855442
2140836,2019-09-30 14:57:22.996,0.054,0.0,-1.000000e-03,-0.281,-0.423,-0.436,-0.306,-0.239,-0.153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.220,0.0,0.006,1569855442
2140837,2019-09-30 14:57:22.997,0.051,0.0,8.000000e-03,-0.276,-0.400,-0.418,-0.300,-0.255,-0.163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.189,0.0,0.001,1569855442
2140838,2019-09-30 14:57:22.998,0.056,0.0,-1.136868e-13,-0.278,-0.374,-0.386,-0.285,-0.253,-0.163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.158,0.0,0.001,1569855442


In [16]:
#merging the mass we receive with the correct passage in the signals folder based on the timestamp
mass_signal_map = pd.merge(mass_df,  
                      signal_df,  
                      on ='timestamp',  
                      how ='inner')
mass_signal_map

Unnamed: 0,time_x,mass,timestamp,time_y,channel_1,channel_10,channel_11,channel_12,channel_13,channel_14,...,channel_59,channel_6,channel_60,channel_61,channel_62,channel_63,channel_64,channel_7,channel_8,channel_9
0,2019-09-26 14:26:00,44000.0,1569507960,2019-09-26 14:26:00.000,-0.132,0.0,-0.158,-0.159,0.418,0.429,...,-8.526513e-14,0.0,0.0,0.0,0.0,0.0,0.0,-0.691,0.0,-0.372
1,2019-09-26 14:26:00,44000.0,1569507960,2019-09-26 14:26:00.001,-0.138,0.0,-0.155,-0.156,0.411,0.422,...,-8.526513e-14,0.0,0.0,0.0,0.0,0.0,0.0,-0.696,0.0,-0.366
2,2019-09-26 14:26:00,44000.0,1569507960,2019-09-26 14:26:00.002,-0.134,0.0,-0.147,-0.156,0.418,0.413,...,-8.526513e-14,0.0,0.0,0.0,0.0,0.0,0.0,-0.731,0.0,-0.364
3,2019-09-26 14:26:00,44000.0,1569507960,2019-09-26 14:26:00.003,-0.139,0.0,-0.135,-0.141,0.384,0.376,...,-8.526513e-14,0.0,0.0,0.0,0.0,0.0,0.0,-0.704,0.0,-0.356
4,2019-09-26 14:26:00,44000.0,1569507960,2019-09-26 14:26:00.004,-0.137,0.0,-0.120,-0.132,0.373,0.347,...,-8.526513e-14,0.0,0.0,0.0,0.0,0.0,0.0,-0.667,0.0,-0.331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,2019-09-30 13:31:00,44000.0,1569850260,2019-09-30 13:31:00.995,0.084,0.0,0.313,-0.129,0.011,0.081,...,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.384,0.0,-0.049
5996,2019-09-30 13:31:00,44000.0,1569850260,2019-09-30 13:31:00.996,0.078,0.0,0.304,-0.137,0.017,0.093,...,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.362,0.0,-0.056
5997,2019-09-30 13:31:00,44000.0,1569850260,2019-09-30 13:31:00.997,0.082,0.0,0.285,-0.155,0.030,0.107,...,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.335,0.0,-0.071
5998,2019-09-30 13:31:00,44000.0,1569850260,2019-09-30 13:31:00.998,0.085,0.0,0.291,-0.150,0.077,0.161,...,0.000000e+00,0.0,0.0,0.0,0.0,0.0,0.0,0.331,0.0,-0.070


In [17]:
#drop the unwanted columns
mass_signal_map = mass_signal_map.drop(['time_x', 'time_y'], axis=1)
mass_signal_map['mass'].dtype

dtype('float64')

In [24]:
# Y is mass(Dependent variable) and X is everything excluding mass(Independent variable)
Y=mass_signal_map.iloc[:,0].values
X=mass_signal_map.iloc[:,1:66].values
x[0]

array([ 1.56950796e+09, -1.32000000e-01,  0.00000000e+00, -1.58000000e-01,
       -1.59000000e-01,  4.18000000e-01,  4.29000000e-01,  2.13000000e-01,
       -7.70000000e-02, -4.00000000e-02,  1.29000000e-01, -1.62000000e-01,
        1.80000000e-01, -1.26000000e-01, -7.00000000e-02, -1.60000000e-02,
       -2.38000000e-01, -8.80000000e-02, -1.70000000e-01, -2.00000000e-03,
        2.33000000e-01,  9.80000000e-02,  2.93000000e-01, -7.00000000e-02,
        2.73000000e-01,  2.59000000e-01,  8.90000000e-02,  6.90000000e-02,
       -1.32000000e-01, -8.10000000e-02, -1.39000000e-01,  3.90000000e-02,
        1.18000000e-01,  6.70000000e-02, -8.00000000e-02, -7.00000000e-02,
       -8.90000000e-02, -2.07000000e-01, -2.53000000e-01,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -4.40000000e-01,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  2.04000000e-01,  1.04000000e-01, -9.00000000e-03,
       -2.08000000e-01,  

In [19]:
# Encoding categorical data
labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])
print(X)
ct = ColumnTransformer([("timestamp", OneHotEncoder(), [0])], remainder = 'passthrough')
X = ct.fit_transform(X)

print(X)

[[ 0.    -0.132  0.    ... -0.691  0.    -0.372]
 [ 0.    -0.138  0.    ... -0.696  0.    -0.366]
 [ 0.    -0.134  0.    ... -0.731  0.    -0.364]
 ...
 [ 5.     0.082  0.    ...  0.335  0.    -0.071]
 [ 5.     0.085  0.    ...  0.331  0.    -0.07 ]
 [ 5.     0.08   0.    ...  0.367  0.    -0.111]]
[[ 1.     0.     0.    ... -0.691  0.    -0.372]
 [ 1.     0.     0.    ... -0.696  0.    -0.366]
 [ 1.     0.     0.    ... -0.731  0.    -0.364]
 ...
 [ 0.     0.     0.    ...  0.335  0.    -0.071]
 [ 0.     0.     0.    ...  0.331  0.    -0.07 ]
 [ 0.     0.     0.    ...  0.367  0.    -0.111]]


In [25]:
# Avoiding the Dummy Variable Trap
#X = X[:, 1:]
#print(X)
#splitting the dataset as training and testing dataset
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3, random_state=0)


# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)
#Actual Value 
print(Y_test)
#Prediced Value
print(y_pred)

[43400. 43800. 44000. ... 43800. 44000. 43400.]
[43449.38244224 43819.12391299 44003.79467302 ... 43769.63778227
 44018.1787659  43391.66868909]


In [26]:
#testing the accuracy of the data
r2_score(Y_test, y_pred)

0.9992517911471425