In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [11]:
beijing = pd.read_csv('Cities/Beijing_labeled.csv')
chengdu = pd.read_csv('Cities/Chengdu_labeled.csv')
guangzhou = pd.read_csv('Cities/Guangzhou_labeled.csv')
shanghai = pd.read_csv('Cities/Shanghai_labeled.csv')
shenyang = pd.read_csv('Cities/Shenyang_labeled.csv')


print(beijing.shape, chengdu.shape, guangzhou.shape, shanghai.shape, shenyang.shape)

beijing.columns


(2071, 11) (1110, 11) (1352, 11) (1351, 11) (824, 11)


Index(['season', 'DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation',
       'cbwd_NE', 'cbwd_NW', 'cbwd_SE', 'PM_HIGH'],
      dtype='object')

In [3]:
beijing.head()

Unnamed: 0,season,DEWP,HUMI,PRES,TEMP,Iws,precipitation,cbwd_NE,cbwd_NW,cbwd_SE,PM_HIGH
0,4,-8.0,79.0,1026.0,-5.0,23.69,0.0,0,0,1,1.0
1,4,-11.0,85.0,1021.0,-9.0,105.93,1.1,0,0,1,0.0
2,4,-21.0,43.0,1030.0,-11.0,117.55,0.0,0,1,0,0.0
3,4,-25.0,33.0,1034.0,-12.0,39.35,0.0,1,0,0,0.0
4,4,-24.0,30.0,1034.0,-10.0,59.0,0.0,1,0,0,0.0


In [4]:
class PM_HIGH_PREDICTOR:
    def __init__(self, K, tolerance=1e-5):
        self.mean = None
        self.std = None
        self.K = K
        self.tolerance=tolerance
        self.centroids = None
    
    def normalize(self, X, training = False):

        """
        Inputs:
        - X (numpy array): Array containing the input data.
        """

        if training:
            self.mean = np.mean(X, axis = 0)        # Mean over the rows of the input array
            self.std = np.std(X, axis = 0)          # STD over the rows of the input array

        return (X - self.mean)/self.std
        
    def fit(self, X_train, verbose=False):
        
        """
        Inputs:
        - X (np.array): training data
        - verbose: verbosity parameter (set to True to see distance and # of iterations)
        Outputs:
        - None
        """
        
        if len(X_train.shape) == 1:
            raise Exception('You need at least two vectors to train K-Means')
            
        ### INPUT DATA INITIALIZATION ###
        X = self.normalize(X_train, training=True)                                                           
        
        ### RANDOM CENTROID INITIALIZATION ###
        self.centroids = np.random.normal(loc=0, scale= 1, size = (self.K, X.shape[1]))     
        
        centroid_distance = 100
        n_iterations = 0
        
        
        #### ITERATIVE STEP ####
        while centroid_distance > self.tolerance:
            
            # Computing distance between each datapoint and each centroid (returns array of shape n_datapoints x n_centroids)
            point_centroid_distances = np.linalg.norm(X[:, np.newaxis, :] - self.centroids[np.newaxis, :, :], axis = 2)
            
            # Assigning each datapoint to closest centroid (returns array of shape n_datapoints x 1, with entries in [0, K])
            centroid_assignment = np.argmin(point_centroid_distances, axis = 1)

            
            new_centroids = []
            for i in range(self.K):
                if np.bincount(centroid_assignment)[i]:
                    
                    # Computing new centroids by doing mean of assigned datapoints
                    new_centroids.append(np.mean(X[centroid_assignment == i], axis = 0))
                
                else:
                    new_centroids.append(self.centroids[i])
            
            new_centroids = np.array(new_centroids)
            centroid_distance = np.linalg.norm(self.centroids - new_centroids)
            self.centroids = new_centroids
            n_iterations += 1
            if verbose:
                print(f'ITERATION {n_iterations}. DISTANCE: {centroid_distance}')
    
    def predict(self, X_test):
        if self.centroids is None:
            raise Exception('The model has not been trained')
        
        if len(X_test.shape) == 1:
            X_test = X_test[np.newaxis, :]
            
        if X_test.shape[1] != self.centroids.shape[1]:
            raise Exception(f'Doing prediction with {X_test.shape[1]} features, but trained on {self.centroids.shape[1]}')
        
        X = self.normalize(X_test, training=False)
        
        point_centroid_distances = np.linalg.norm(X[:, np.newaxis, :] - self.centroids[np.newaxis, :, :], axis = 2)
        return np.argmin(point_centroid_distances, axis = 1)
    
    def score(self, X, y):
        y_pred = self.predict(X)
        n_labels = max(self.centroids.shape[0], int(np.max(y)))
        confusion_matrix = np.zeros((n_labels, n_labels), dtype=int)

        # Counting entries of Confusion Matrix
        for true_label, pred_label in zip(y, y_pred):
            confusion_matrix[int(true_label), int(pred_label)] += 1
            
        # Computing accuracy
        accuracy = np.trace(confusion_matrix) / np.sum(confusion_matrix)
        
        return accuracy, confusion_matrix
    
    

In [None]:
beijing_np = beijing.to_numpy()

temp = PM_HIGH_PREDICTOR(K = 2)

np.max(beijing_np[:, 10])

1.0

In [8]:
temp.fit(beijing_np[:, :10], verbose=True)

ITERATION 1. DISTANCE: 3.2547687757949073
ITERATION 2. DISTANCE: 1.0584827196793352
ITERATION 3. DISTANCE: 0.47365467886217394
ITERATION 4. DISTANCE: 0.1066288223585272
ITERATION 5. DISTANCE: 0.03431855573919686
ITERATION 6. DISTANCE: 0.013284707383571435
ITERATION 7. DISTANCE: 0.003912037657740051
ITERATION 8. DISTANCE: 0.0


In [9]:
beijing_np

array([[  4.,  -8.,  79., ...,   0.,   1.,   1.],
       [  4., -11.,  85., ...,   0.,   1.,   0.],
       [  4., -21.,  43., ...,   1.,   0.,   0.],
       ...,
       [  4.,  -8.,  50., ...,   1.,   0.,   1.],
       [  4., -11.,  28., ...,   1.,   0.,   0.],
       [  4., -10.,  37., ...,   0.,   0.,   0.]])

## class imbalance

In [25]:
classes = (beijing['PM_HIGH'].value_counts() + shenyang['PM_HIGH'].value_counts()).to_dict()
print(classes)

class_ratio = classes[0]/classes[1]
class_ratio

{0.0: 2099, 1.0: 796}


2.636934673366834

In [32]:
## data augmentation

beijing_augmented = beijing.copy()

def augment_dataframe(df):

    if not isinstance(df, pd.DataFrame):
        raise Exception('Input must be a pandas DataFrame')

    df_augmented = df.copy()
    classes = df_augmented['PM_HIGH'].value_counts().to_dict()

    minority_class = 0
    if classes[0] > classes[1]:
        n_samples = int(abs(classes[0] - classes[1]))
        minority_class = 1
    
    df_minority = df_augmented[df_augmented['PM_HIGH'] == minority_class]
    df_newdatapoint = pd.DataFrame(columns=df_augmented.columns)

    for _ in range(n_samples):

        row1, row2 = df_minority.sample(n=2).to_numpy()
        lambd = np.random.rand()
        
        new_datapoint = lambd * row1 + (1 - lambd) * row2
        new_datapoint[-1] = minority_class
        
        df_newdatapoint = pd.concat([df_newdatapoint, pd.Series(new_datapoint, index=df_augmented.columns)], ignore_index=True)


    df_augmented = pd.concat([df_augmented, df_newdatapoint])
    return df_augmented


beijing_augmented = augment_dataframe(beijing)
beijing_augmented


  df_augmented = pd.concat([df_augmented, df_newdatapoint])


Unnamed: 0,season,DEWP,HUMI,PRES,TEMP,Iws,precipitation,cbwd_NE,cbwd_NW,cbwd_SE,PM_HIGH,0
0,4,-8.0,79.0,1026.0,-5.0,23.69,0.0,0,0,1,1.0,
1,4,-11.0,85.0,1021.0,-9.0,105.93,1.1,0,0,1,0.0,
2,4,-21.0,43.0,1030.0,-11.0,117.55,0.0,0,1,0,0.0,
3,4,-25.0,33.0,1034.0,-12.0,39.35,0.0,1,0,0,0.0,
4,4,-24.0,30.0,1034.0,-10.0,59.00,0.0,1,0,0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
8498,,,,,,,,,,,,0.000000
8499,,,,,,,,,,,,0.810311
8500,,,,,,,,,,,,0.000000
8501,,,,,,,,,,,,0.189689


In [143]:
def compute_class_balance(df, class_column):
    classes = df[class_column].value_counts().to_dict()
    return classes


def augment_dataframe(df):

    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    df_augmented = df.copy()
    classes = df['PM_HIGH'].value_counts()

    minority_class = 0 if classes[0] < classes[1] else 1
    n_samples = int(abs(classes[0] - classes[1]))

    df_minority_np = df[df['PM_HIGH'] == minority_class].to_numpy()

    row_indices = np.random.randint(0, len(df_minority_np), size=(n_samples, 1))
    row_indices_2 = np.zeros((n_samples, 1), dtype=int)

    for i, index in enumerate(row_indices):
        row = df_minority_np[index]
        temp_index = np.where(
                            (df_minority_np[:, 0] == row[:,0]) &  # Column 1: 'season'
                            (df_minority_np[:, 7] == row[:,7]) &  # Column 2: 'cbwd_NE'
                            (df_minority_np[:, 8] == row[:,8]) &  # Column 3: 'cbwd_NW'
                            (df_minority_np[:, 9] == row[:,9])    # Column 4: 'cbwd_SE'
                        )[0]
        
        if len(temp_index) < 2:
            raise ValueError('Not enough similar rows to interpolate')
        
        row_indices_2[i] = np.random.choice(temp_index)        

    lambdas = np.random.rand(n_samples, 1)
    row1 = df_minority_np[row_indices[:, 0]]
    row2 = df_minority_np[row_indices_2[:, 0]]
    new_datapoints = lambdas * row1 + (1 - lambdas) * row2
    new_datapoints[:, -1] = minority_class  

    df_newdatapoint = pd.DataFrame(new_datapoints, columns=df.columns)
    df_augmented = pd.concat([df_augmented, df_newdatapoint], ignore_index=True)

    return df_augmented


In [135]:
beijing_np = beijing.to_numpy()
row = beijing_np[10]
row

array([ 4.000e+00, -2.300e+01,  3.600e+01,  1.029e+03, -1.100e+01,
        4.782e+01,  0.000e+00,  0.000e+00,  1.000e+00,  0.000e+00,
        0.000e+00])

In [137]:
beijing_np

array([[  4.,  -8.,  79., ...,   0.,   1.,   1.],
       [  4., -11.,  85., ...,   0.,   1.,   0.],
       [  4., -21.,  43., ...,   1.,   0.,   0.],
       ...,
       [  4.,  -8.,  50., ...,   1.,   0.,   1.],
       [  4., -11.,  28., ...,   1.,   0.,   0.],
       [  4., -10.,  37., ...,   0.,   0.,   0.]])

In [142]:
beijing_augmented = augment_dataframe(beijing)
beijing_augmented

Unnamed: 0,season,DEWP,HUMI,PRES,TEMP,Iws,precipitation,cbwd_NE,cbwd_NW,cbwd_SE,PM_HIGH
0,4.0,-8.000000,79.000000,1026.000000,-5.000000,23.690000,0.000000,0.0,0.0,1.0,1.0
1,4.0,-11.000000,85.000000,1021.000000,-9.000000,105.930000,1.100000,0.0,0.0,1.0,0.0
2,4.0,-21.000000,43.000000,1030.000000,-11.000000,117.550000,0.000000,0.0,1.0,0.0,0.0
3,4.0,-25.000000,33.000000,1034.000000,-12.000000,39.350000,0.000000,1.0,0.0,0.0,0.0
4,4.0,-24.000000,30.000000,1034.000000,-10.000000,59.000000,0.000000,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2839,3.0,5.795019,75.329208,1020.124226,9.962717,1.152732,0.000000,0.0,0.0,0.0,1.0
2840,3.0,7.758356,59.775707,1022.827763,15.361826,2.321054,0.000000,0.0,0.0,0.0,1.0
2841,2.0,21.428666,80.618890,1003.476222,24.904888,20.226525,0.209511,0.0,0.0,1.0,1.0
2842,3.0,-0.560767,83.121535,1024.720682,2.158849,2.040267,0.000000,0.0,0.0,1.0,1.0


In [129]:
print(compute_class_balance(beijing, 'PM_HIGH'))
print(compute_class_balance(beijing_augmented, 'PM_HIGH'))

{0.0: 1422, 1.0: 649}
{1.0: 1422, 0.0: 1422}


In [47]:
beijing.columns

Index(['season', 'DEWP', 'HUMI', 'PRES', 'TEMP', 'Iws', 'precipitation',
       'cbwd_NE', 'cbwd_NW', 'cbwd_SE', 'PM_HIGH'],
      dtype='object')

In [55]:
beijing['cbwd_SE'].value_counts()

cbwd_SE
0    1417
1     654
Name: count, dtype: int64