In [110]:
#load the data
import pandas as pd
import numpy as np
# Read the raw training and test tables.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Keep the test IDs for the submission file before the column is removed.
target_ids = test['ID'].to_numpy(copy=True)

# ID is only an identifier, so remove it from both tables.
train = train.drop(columns=['ID'])
test = test.drop(columns=['ID'])


In [111]:
# Count missing values per column in both tables and report only the columns
# that actually contain any. The printed result shows that parentspecies is
# the only such column, i.e. some molecules have no parent species assigned.
nan_columns_train = train.isna().sum()
nan_columns_test = test.isna().sum()
train_missing = nan_columns_train[nan_columns_train > 0]
test_missing = nan_columns_test[nan_columns_test > 0]
print(f'Train: {train_missing}')
print(f'Test: {test_missing}')

Train: parentspecies    210
dtype: int64
Test: parentspecies    33
dtype: int64


In [112]:
# Frequency of each parentspecies value; several of the combined categories
# occur only a handful of times.
train.loc[:, 'parentspecies'].value_counts()

parentspecies
toluene                17950
apin                    6165
decane                  2218
apin_decane               46
apin_toluene              37
apin_decane_toluene        9
decane_toluene             2
Name: count, dtype: int64

In [113]:
# We decide not to drop the rows with a missing parentspecies. Other
# categorical values also have very low counts, so missing values and every
# category with fewer than 50 rows are merged into one new value, 'other'.
species_filled = train['parentspecies'].fillna('other')
species_counts = species_filled.value_counts()
rare_species = species_counts[species_counts < 50].index
train['parentspecies'] = species_filled.replace(rare_species, 'other')
train['parentspecies'].value_counts()

parentspecies
toluene    17950
apin        6165
decane      2218
other        304
Name: count, dtype: int64

In [114]:
# Same grouping for the test data.
# The categories to keep are taken from the (already grouped) training column
# instead of from test-set frequencies: a category that was merged into
# 'other' in train must also become 'other' in test, whatever its count here,
# otherwise the encoder fitted on train would meet a category it never saw.
known_species = pd.Index(train['parentspecies'].unique())
test['parentspecies'] = test['parentspecies'].fillna('other')
# Everything present in test but absent from the training vocabulary.
rare_species = pd.Index(test['parentspecies'].unique()).difference(known_species)
test['parentspecies'] = test['parentspecies'].where(
    ~test['parentspecies'].isin(rare_species), 'other'
)
test['parentspecies'].value_counts()

parentspecies
toluene    3379
apin       1195
decane      381
other        45
Name: count, dtype: int64

In [115]:
# Separate the regression target from the training features.
X_train = train.drop(columns=['log_pSat_Pa'])
y_train = train['log_pSat_Pa']
# Work on a copy so that columns added to X_test later do not silently
# modify the `test` frame as well.
X_test = test.copy()

In [116]:
from category_encoders import TargetEncoder

# Replace the categorical parentspecies column with a numeric encoding.
# The encoder is fitted on the training column together with the target and
# then reused, unchanged, on the test column.
encoder = TargetEncoder()

train_species_encoded = encoder.fit_transform(X_train['parentspecies'], y_train)
X_train['parentspecies_encoded'] = train_species_encoded
X_train = X_train.drop(columns=['parentspecies'])

test_species_encoded = encoder.transform(X_test['parentspecies'])
X_test['parentspecies_encoded'] = test_species_encoded
X_test = X_test.drop(columns=['parentspecies'])

In [117]:
# Inspect the training features after encoding; parentspecies_encoded is now
# the last column (the notebook shows a truncated view of the frame).
X_train

Unnamed: 0,MW,NumOfAtoms,NumOfC,NumOfO,NumOfN,NumHBondDonors,NumOfConf,NumOfConfUsed,C=C (non-aromatic),C=C-C=O in non-aromatic ring,...,ether (alicyclic),nitrate,nitro,aromatic hydroxyl,carbonylperoxynitrate,peroxide,hydroperoxide,carbonylperoxyacid,nitroester,parentspecies_encoded
0,224.016832,23,6,9,0,4,485.0,40.0,0,0,...,1,0,0,0,0,0,2,0,0,-5.485806
1,310.064845,35,9,10,2,1,236.0,40.0,0,0,...,0,2,0,0,0,0,1,0,0,-5.581852
2,368.033938,37,10,13,2,1,308.0,40.0,0,0,...,0,1,0,0,1,0,1,0,0,-5.581852
3,299.012475,29,7,12,1,4,769.0,3.0,0,0,...,0,1,0,0,0,0,1,1,0,-5.485806
4,202.011353,20,7,7,0,1,77.0,32.0,0,0,...,1,0,0,0,0,0,1,0,0,-5.485806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26632,221.017166,22,6,8,1,1,47.0,37.0,0,0,...,1,1,0,0,0,0,1,0,0,-5.485806
26633,222.001182,21,6,9,0,3,323.0,12.0,0,0,...,0,0,0,0,0,1,0,0,0,-5.485806
26634,287.012475,28,6,12,1,4,362.0,11.0,0,0,...,0,1,0,0,0,1,2,0,0,-5.485806
26635,284.996825,26,6,12,1,3,322.0,35.0,0,0,...,0,1,0,0,0,1,1,1,0,-5.485806


In [None]:
#correlation analysis
# `train` still contains the string-valued parentspecies column, and
# DataFrame.corr() raises on non-numeric data in recent pandas versions,
# so the correlation matrix is computed on the numeric columns only.
numeric_train = train.select_dtypes(include='number')
correlation_matrix = numeric_train.corr()

# Correlation of every numeric feature with the target (target itself removed).
log_psat_correlations = correlation_matrix['log_pSat_Pa'].drop('log_pSat_Pa')

# Features whose absolute correlation with the target exceeds 0.1.
relevant_features = log_psat_correlations[log_psat_correlations.abs() > 0.1].index

In [119]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the features. The scaler learns its ranges from the training
# features only and is then applied to both the training and the test features.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The target gets its own scaler so predictions can be mapped back later.
# The scaler expects a 2-D array, hence the reshape to a single column.
y_scaler = MinMaxScaler()
y_train_column = y_train.to_numpy().reshape(-1, 1)
y_train_scaled = y_scaler.fit_transform(y_train_column)

In [122]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV

# Candidate hyperparameters for the RBF-kernel SVR (4 x 4 = 16 combinations).
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 'auto'],
    'kernel': ['rbf']
}

# Randomized search: only a random subset of the combinations is evaluated
# (10 candidates with the default n_iter), each with 5-fold cross-validation
# scored by R^2. random_state is fixed so the same candidates are sampled on
# every run and the reported best parameters are reproducible.
# The variable keeps the name `grid_search` although it is a RandomizedSearchCV.
grid_search = RandomizedSearchCV(
    SVR(),
    param_grid,
    cv=5,
    scoring='r2',
    verbose=1,
    random_state=42,
)

# Run the search on the scaled training data (the target is flattened to 1-D).
grid_search.fit(X_train_scaled, y_train_scaled.flatten())

# Best parameters found among the sampled candidates
print("Best Parameters:", grid_search.best_params_)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'kernel': 'rbf', 'gamma': 'auto', 'C': 100}


In [123]:
from sklearn.svm import SVR
# Train the final model with the hyperparameters reported by the search above.
svr = SVR(kernel='rbf', C=100, gamma='auto')
svr.fit(X_train_scaled, y_train_scaled.ravel())

# Predict in the scaled target space, then undo the target scaling.
# inverse_transform needs a 2-D column, so reshape first and flatten afterwards.
y_pred_scaled = svr.predict(X_test_scaled)
y_pred_column = y_pred_scaled.reshape(-1, 1)
y_pred = y_scaler.inverse_transform(y_pred_column).flatten()

In [124]:
# Pair each test ID with its predicted target value.
df = pd.DataFrame({'ID': target_ids, 'TARGET': y_pred})

# Write the submission file without the DataFrame index.
df.to_csv('predictions.csv', index=False)