In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_excel("stint_full_data.xlsx")
df.head(10)

Unnamed: 0,meeting_key,session_key,stint_number,driver_number,lap_start,lap_end,tyre_age_at_start,compound_HARD,compound_INTERMEDIATE,compound_MEDIUM,compound_SOFT,compound_WET
0,1140,7763,1,4,1,2,7,0,0,1,0,0
1,1140,7763,1,20,1,2,10,0,0,1,0,0
2,1140,7763,1,2,1,3,15,0,0,0,1,0
3,1140,7763,1,14,1,3,0,0,0,1,0,0
4,1140,7763,1,24,1,3,10,0,0,1,0,0
5,1140,7763,1,16,1,4,0,0,0,1,0,0
6,1140,7763,1,22,1,4,12,0,0,1,0,0
7,1140,7763,1,63,1,4,11,0,0,1,0,0
8,1140,7763,1,31,1,5,5,0,0,1,0,0
9,1140,7763,1,81,1,5,0,0,0,1,0,0


In [3]:
df.shape

(22854, 12)

In [4]:
compound_cols = [
    'compound_SOFT',
    'compound_MEDIUM',
    'compound_HARD',
    'compound_INTERMEDIATE',
    'compound_WET'
]

max_life_map = {'compound_SOFT':15,
'compound_MEDIUM':20,
'compound_HARD':50,
'compound_INTERMEDIATE':100,
'compound_WET': 200}

In [5]:
df['MAX_LAPS'] = df[compound_cols].dot(
    pd.Series(max_life_map)
)

In [6]:
df.head(10)

Unnamed: 0,meeting_key,session_key,stint_number,driver_number,lap_start,lap_end,tyre_age_at_start,compound_HARD,compound_INTERMEDIATE,compound_MEDIUM,compound_SOFT,compound_WET,MAX_LAPS
0,1140,7763,1,4,1,2,7,0,0,1,0,0,20
1,1140,7763,1,20,1,2,10,0,0,1,0,0,20
2,1140,7763,1,2,1,3,15,0,0,0,1,0,15
3,1140,7763,1,14,1,3,0,0,0,1,0,0,20
4,1140,7763,1,24,1,3,10,0,0,1,0,0,20
5,1140,7763,1,16,1,4,0,0,0,1,0,0,20
6,1140,7763,1,22,1,4,12,0,0,1,0,0,20
7,1140,7763,1,63,1,4,11,0,0,1,0,0,20
8,1140,7763,1,31,1,5,5,0,0,1,0,0,20
9,1140,7763,1,81,1,5,0,0,0,1,0,0,20


In [7]:
df = df.drop(columns=['stint_number'])


In [8]:
df['TIRE_AGE'] = (df['lap_end'] - df['lap_start']) + df['tyre_age_at_start']

In [9]:
df = df.drop(columns=['lap_end','lap_start','tyre_age_at_start'])
df.head(10)

Unnamed: 0,meeting_key,session_key,driver_number,compound_HARD,compound_INTERMEDIATE,compound_MEDIUM,compound_SOFT,compound_WET,MAX_LAPS,TIRE_AGE
0,1140,7763,4,0,0,1,0,0,20,8
1,1140,7763,20,0,0,1,0,0,20,11
2,1140,7763,2,0,0,0,1,0,15,17
3,1140,7763,14,0,0,1,0,0,20,2
4,1140,7763,24,0,0,1,0,0,20,12
5,1140,7763,16,0,0,1,0,0,20,3
6,1140,7763,22,0,0,1,0,0,20,15
7,1140,7763,63,0,0,1,0,0,20,14
8,1140,7763,31,0,0,1,0,0,20,9
9,1140,7763,81,0,0,1,0,0,20,4


In [10]:
df['Percent_Degradation'] = (((df['TIRE_AGE'] / df['MAX_LAPS']) * 100 ).astype(float)).round(2)

In [11]:
df.head(10)

Unnamed: 0,meeting_key,session_key,driver_number,compound_HARD,compound_INTERMEDIATE,compound_MEDIUM,compound_SOFT,compound_WET,MAX_LAPS,TIRE_AGE,Percent_Degradation
0,1140,7763,4,0,0,1,0,0,20,8,40.0
1,1140,7763,20,0,0,1,0,0,20,11,55.0
2,1140,7763,2,0,0,0,1,0,15,17,113.33
3,1140,7763,14,0,0,1,0,0,20,2,10.0
4,1140,7763,24,0,0,1,0,0,20,12,60.0
5,1140,7763,16,0,0,1,0,0,20,3,15.0
6,1140,7763,22,0,0,1,0,0,20,15,75.0
7,1140,7763,63,0,0,1,0,0,20,14,70.0
8,1140,7763,31,0,0,1,0,0,20,9,45.0
9,1140,7763,81,0,0,1,0,0,20,4,20.0


In [12]:
df = df.drop(columns=['meeting_key','session_key','driver_number'])
df.head(10)

Unnamed: 0,compound_HARD,compound_INTERMEDIATE,compound_MEDIUM,compound_SOFT,compound_WET,MAX_LAPS,TIRE_AGE,Percent_Degradation
0,0,0,1,0,0,20,8,40.0
1,0,0,1,0,0,20,11,55.0
2,0,0,0,1,0,15,17,113.33
3,0,0,1,0,0,20,2,10.0
4,0,0,1,0,0,20,12,60.0
5,0,0,1,0,0,20,3,15.0
6,0,0,1,0,0,20,15,75.0
7,0,0,1,0,0,20,14,70.0
8,0,0,1,0,0,20,9,45.0
9,0,0,1,0,0,20,4,20.0


In [13]:
df.shape

(22854, 8)

In [14]:
df = df[df['Percent_Degradation'] <= 95]
df.shape

(21417, 8)

In [15]:
df.columns

Index(['compound_HARD', 'compound_INTERMEDIATE', 'compound_MEDIUM',
       'compound_SOFT', 'compound_WET', 'MAX_LAPS', 'TIRE_AGE',
       'Percent_Degradation'],
      dtype='object')

In [16]:
df.head(10)

Unnamed: 0,compound_HARD,compound_INTERMEDIATE,compound_MEDIUM,compound_SOFT,compound_WET,MAX_LAPS,TIRE_AGE,Percent_Degradation
0,0,0,1,0,0,20,8,40.0
1,0,0,1,0,0,20,11,55.0
3,0,0,1,0,0,20,2,10.0
4,0,0,1,0,0,20,12,60.0
5,0,0,1,0,0,20,3,15.0
6,0,0,1,0,0,20,15,75.0
7,0,0,1,0,0,20,14,70.0
8,0,0,1,0,0,20,9,45.0
9,0,0,1,0,0,20,4,20.0
10,0,0,1,0,0,20,14,70.0


In [18]:
df2 = pd.read_excel("weather_data.xlsx")
df2.shape

(31977, 4)

In [19]:
df2.columns

Index(['rainfall', 'humidity', 'track_temperature', 'air_temperature'], dtype='object')

In [20]:
df2 = df2[['track_temperature','air_temperature']]
df2.columns

Index(['track_temperature', 'air_temperature'], dtype='object')

In [22]:
df2 = df2[:21417]
df2.shape

(21417, 2)

In [23]:
df_combined = pd.concat([df, df2], axis=1)
df_combined.head(10)

Unnamed: 0,compound_HARD,compound_INTERMEDIATE,compound_MEDIUM,compound_SOFT,compound_WET,MAX_LAPS,TIRE_AGE,Percent_Degradation,track_temperature,air_temperature
0,0.0,0.0,1.0,0.0,0.0,20.0,8.0,40.0,29.7,23.8
1,0.0,0.0,1.0,0.0,0.0,20.0,11.0,55.0,29.7,23.8
3,0.0,0.0,1.0,0.0,0.0,20.0,2.0,10.0,30.1,23.8
4,0.0,0.0,1.0,0.0,0.0,20.0,12.0,60.0,30.1,23.8
5,0.0,0.0,1.0,0.0,0.0,20.0,3.0,15.0,30.3,23.8
6,0.0,0.0,1.0,0.0,0.0,20.0,15.0,75.0,30.4,23.9
7,0.0,0.0,1.0,0.0,0.0,20.0,14.0,70.0,30.4,23.9
8,0.0,0.0,1.0,0.0,0.0,20.0,9.0,45.0,30.7,24.0
9,0.0,0.0,1.0,0.0,0.0,20.0,4.0,20.0,30.8,24.0
10,0.0,0.0,1.0,0.0,0.0,20.0,14.0,70.0,30.8,24.0


### BRIEF

I am considering the percent degradation as the target variable. All others will be considered as independent variables collected by sensors and other data. The data is only for demonstration purposes on how a model pipeline can be developed for a tire degradation prediction model.

In [24]:
df_combined.columns

Index(['compound_HARD', 'compound_INTERMEDIATE', 'compound_MEDIUM',
       'compound_SOFT', 'compound_WET', 'MAX_LAPS', 'TIRE_AGE',
       'Percent_Degradation', 'track_temperature', 'air_temperature'],
      dtype='object')

In [28]:
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22730 entries, 0 to 21392
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   compound_HARD          21417 non-null  float64
 1   compound_INTERMEDIATE  21417 non-null  float64
 2   compound_MEDIUM        21417 non-null  float64
 3   compound_SOFT          21417 non-null  float64
 4   compound_WET           21417 non-null  float64
 5   MAX_LAPS               21417 non-null  float64
 6   TIRE_AGE               21417 non-null  float64
 7   Percent_Degradation    21417 non-null  float64
 8   track_temperature      21417 non-null  float64
 9   air_temperature        21417 non-null  float64
dtypes: float64(10)
memory usage: 1.9 MB


In [32]:
df_combined = df_combined.dropna(subset=['Percent_Degradation'])

feature_cols = [
    'compound_HARD', 'compound_INTERMEDIATE', 'compound_MEDIUM',
    'compound_SOFT', 'compound_WET',
    'track_temperature', 'air_temperature', 'MAX_LAPS', 'TIRE_AGE'
]
target_col = 'Percent_Degradation'

X = df_combined[feature_cols]


y = df_combined[target_col]

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Build a pipeline (optional: use RandomForest instead of LinearRegression)
pipeline = Pipeline([
    ('scaler', StandardScaler()),              # Scale temperature values
    ('regressor', RandomForestRegressor(random_state=42))  # Try LinearRegression() to compare
])

# Step 4: Train the model
pipeline.fit(X_train, y_train)

# Step 5: Predictions
y_pred = pipeline.predict(X_test)

# Step 6: Evaluation
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R2 Score:", r2_score(y_test, y_pred))

Mean Absolute Error: 0.002395214752588559
Mean Squared Error: 0.0022028296241830042
R2 Score: 0.9999958058002455


### The good performance is due to including MAX_LAPS and TIRE_AGE as Percent_Degradation was derived from them. In real-world use case, it is assumed that type of tire compound, track temperatue, air_temperature, max laps, tire_age are available. Corresponding percent_degradation can be then modelled using historical or data collected from previous sessions.<br>
Again this is just a demonstration program