In [2]:
import pandas as pd
import scipy.stats

In [3]:
# Load the training split and the two held-out splits from the working directory.
train, test1, test2 = map(pd.read_csv, ('train.csv', 'test1.csv', 'test2.csv'))

In [4]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,1849,26/05/2004,19.00.00,-200,1130.0,-200.0,227,1368.0,-200.0,933.0,-200.0,1709.0,1269.0,267,195,6754,,
1,2533,24/06/2004,07.00.00,12,1030.0,-200.0,69,851.0,102.0,824.0,68.0,1700.0,983.0,219,570,14742,,
2,3047,15/07/2004,17.00.00,32,1164.0,-200.0,203,1306.0,259.0,648.0,198.0,1886.0,1218.0,355,191,10888,,
3,805,13/04/2004,07.00.00,39,1496.0,524.0,191,1272.0,328.0,667.0,130.0,2011.0,1399.0,110,642,8398,,
4,2962,12/07/2004,04.00.00,-200,780.0,-200.0,18,568.0,24.0,1200.0,34.0,1331.0,501.0,199,513,11803,,


In [5]:
# Keep only the rows whose NO2(GT) reading is strictly positive, in every split.
# NOTE(review): presumably this drops the -200 placeholder readings visible in
# the train.head() preview — confirm that -200 marks a missing measurement.
train, test1, test2 = (
    split.loc[split['NO2(GT)'].gt(0)] for split in (train, test1, test2)
)

### Kolmogorov–Smirnov test

In [6]:
# Two-sample KS test on NO2(GT): training split vs. test1.
ksm1 = scipy.stats.ks_2samp(*(split['NO2(GT)'] for split in (train, test1)))
ksm1

KstestResult(statistic=np.float64(0.017062220028073977), pvalue=np.float64(0.9971378232852736), statistic_location=np.float64(119.0), statistic_sign=np.int8(-1))

In [7]:
# Two-sample KS test on NO2(GT): training split vs. test2.
ksm2 = scipy.stats.ks_2samp(*(split['NO2(GT)'] for split in (train, test2)))
ksm2

KstestResult(statistic=np.float64(0.3688536442438679), pvalue=np.float64(2.53172387531317e-74), statistic_location=np.float64(130.0), statistic_sign=np.int8(1))

#### Kolmogorov–Smirnov Test Results

**Train vs Test1**

- KS Statistic: 0.01706

- p-value: 0.99713

**Train vs Test2**

- KS Statistic: 0.3689

- p-value: $2.532 \times 10^{-74}$


### Inference

#### 1. KS Statistic and p-value Interpretation

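- The **KS statistic** is the maximum absolute difference between the empirical cumulative distribution functions (ECDFs) of the two samples.
- The **p-value** is the probability of observing a KS statistic at least this large if both samples came from the same distribution; a small p-value (here, below 0.05) means the observed difference is statistically significant.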

| Comparison       | KS Statistic | p-value         | Interpretation                                     |
|------------------|--------------|------------------|---------------------------------------------------|
| Train vs Test1   | 0.01706      | 0.9971           | Very small difference; **not statistically significant** |
| Train vs Test2   | 0.3689       | ≈ 2.5e-74        | Large difference; **highly statistically significant**   |


#### 2. Covariate Shift Detection

- **Covariate shift** occurs when the distribution of input variables changes between the training and test data.
- Based on the KS test:
  - **Test1** does **not** show signs of covariate shift (p-value > 0.05).
  - **Test2** shows **clear evidence of covariate shift** (p-value ≪ 0.05 and a large KS statistic).

#### Conclusion:  
- **Test2 exhibits covariate shift** in the `NO2(GT)` feature relative to the training data.  
- **Test1 does not show covariate shift** and appears to have a similar distribution to the training data.


In [8]:
# Run the KS test (statistic and p-value) for all the other features as well and report the results neatly

def ks_test(train, test, target='NO2(GT)', columns=None):
    """Run a two-sample Kolmogorov–Smirnov test per column of two DataFrames.

    Each selected column of `train` is compared with the same-named column of
    `test` using `scipy.stats.ks_2samp`. Values are compared as-is.
    NOTE(review): placeholder readings such as the -200 values visible in the
    train.head() preview are not removed here — confirm whether they should be
    filtered out before calling.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to compare column by column.
    target : str or None, default 'NO2(GT)'
        Column left out of the comparison. Pass None to keep every column.
    columns : iterable of str, optional
        Columns to test, in the order given. Defaults to every column of
        `train`, in `train`'s column order.

    Returns
    -------
    pandas.DataFrame
        One row per tested column with the columns 'Feature', 'KS Statistic'
        and 'p-value' (numeric values, unformatted).

    Raises
    ------
    KeyError
        If a column selected for testing is missing from `train` or `test`.
    """
    candidates = train.columns if columns is None else list(columns)
    features = [col for col in candidates if col != target]

    # Fail before any test is run, and name every missing column at once.
    missing = [
        col for col in features
        if col not in train.columns or col not in test.columns
    ]
    if missing:
        raise KeyError(f'columns missing from train or test: {missing}')

    ks_results = []
    for col in features:
        ks = scipy.stats.ks_2samp(train[col], test[col])
        ks_results.append((col, ks.statistic, ks.pvalue))
    return pd.DataFrame(ks_results, columns=['Feature', 'KS Statistic', 'p-value'])

# Compute the per-feature KS results for each held-out split, then turn both
# numeric columns into fixed-format strings for display (the columns hold
# strings, not numbers, after this cell).
ks_train_test1, ks_train_test2 = (
    ks_test(train, holdout).assign(**{
        'p-value': lambda frame: frame['p-value'].map(lambda value: f'{value:.2e}'),
        'KS Statistic': lambda frame: frame['KS Statistic'].map(lambda value: f'{value:.4f}'),
    })
    for holdout in (test1, test2)
)

In [11]:
# Show the train-vs-test1 results without the first three and last two rows.
# The slice is positional; NOTE(review): which features those rows hold depends
# on the CSV column order — confirm they are the non-sensor columns.
ks_train_test1.iloc[3:-2]

Unnamed: 0,Feature,KS Statistic,p-value
3,CO(GT),0.0265,0.841
4,PT08.S1(CO),0.0374,0.437
5,NMHC(GT),0.0186,0.991
6,C6H6(GT),0.0479,0.171
7,PT08.S2(NMHC),0.0224,0.947
8,NOx(GT),0.0181,0.994
9,PT08.S3(NOx),0.0401,0.352
10,PT08.S4(NO2),0.0218,0.958
11,PT08.S5(O3),0.0285,0.772
12,T,0.0258,0.861


In [12]:
# Show the train-vs-test2 results without the first three and last two rows.
# The slice is positional; NOTE(review): which features those rows hold depends
# on the CSV column order — confirm they are the non-sensor columns.
ks_train_test2.iloc[3:-2]

Unnamed: 0,Feature,KS Statistic,p-value
3,CO(GT),0.0978,1.6e-05
4,PT08.S1(CO),0.1089,9.74e-07
5,NMHC(GT),0.2612,4.98e-37
6,C6H6(GT),0.058,0.0317
7,PT08.S2(NMHC),0.1677,2.07e-15
8,NOx(GT),0.4878,1.9900000000000002e-132
9,PT08.S3(NOx),0.3093,5.4899999999999995e-52
10,PT08.S4(NO2),0.6005,3.38e-206
11,PT08.S5(O3),0.114,2.37e-07
12,T,0.2793,2.4999999999999997e-42
