In [277]:
import pandas as pd

In [278]:
f = open('C:/Users/17172/Desktop/iRCT/iRCT/Output_files/output_outcomeEDVisit_TreatmentCOVID.txt', 'w')

outcomeCol = 'Dyspnea'
treatmentCol = 'COVID'
excludedColumns = []
df = pd.read_csv("C:/Users/17172/Desktop/iRCT/datasets/COVID3_4Nodes3.dat")
df.index = range(1, len(df)+1, 1)
df = df.replace(to_replace='No', value=0)
df = df.replace(to_replace='Yes', value=1)
df = df.replace(to_replace='Negtive', value=0)
df = df.replace(to_replace='Positive', value=1)
df

Unnamed: 0,ED_Visit,Dyspnea,COPD,COVID
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,1
5,0,0,0,0
...,...,...,...,...
49996,0,1,0,1
49997,0,0,0,0
49998,0,0,0,0
49999,0,0,0,0


In [279]:
import pandas as pd
import numpy as np
import math
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import time

from sklearn.linear_model import LogisticRegression as lr

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics

class iRCT:
    def __init__(self, dataframe, treatmentCol, outcomeCol):
        self.df = dataframe
        self.treatmentCol = treatmentCol
        self.covariateCol = 'propensityScoreLogit'
        self.indexCol = self.df.index
        self.outcomeCol = outcomeCol
        self.relationVal, self.timeToComplete = self.calculateRelationVal()

    def calculateRelationVal(self):

        '''
        This function is based on this notebook: https://github.com/konosp/propensity-score-matching/blob/main/propensity_score_matching_v2.ipynb
        '''

        start = time.time()

        # Creates matches column for matching estimators
        emptyVal = [0] * self.df.index
        self.df.insert(len(self.df.columns), 'matches', emptyVal)

        self.df, X = self.generatePropensityScores()

        knn = NearestNeighbors(n_neighbors=5, p=2)
        knn.fit(X[['propensityScoreLogit']].to_numpy())

        distances, indexes = knn.kneighbors(X[['propensityScoreLogit']].to_numpy(), n_neighbors=5)

        def matching(row, indexes, X):
            current_index = int(row['index'])-1
            prop_score_logit = row['propensityScoreLogit']
            for idx in indexes[current_index,:]:
                if (current_index != idx) and (row.treatment == 1) and (X.loc[idx].treatment == 0):
                    return int(idx)

        X['match'] = X.reset_index().apply(matching, axis = 1, args = (indexes, X))
        treated_with_match = ~X.match.isna()
        treated_matched_data = X[treated_with_match][X.columns]
        
        untreated_matched_data = pd.DataFrame(data = treated_matched_data.match)
        untreated_matched_data = untreated_matched_data.set_index('match')

        all_matched_data = pd.concat([treated_matched_data, untreated_matched_data])
        
        overview = all_matched_data[['outcome','treatment']].groupby(by = ['treatment']).aggregate([np.mean, np.var, np.std, 'count'])
        
        treated_outcome = overview['outcome']['mean'][1]
        treated_counterfactual_outcome = overview['outcome']['mean'][0]

        return treated_outcome - treated_counterfactual_outcome, time.time()-start


    def generatePropensityScores(self):
        '''
        :param self: the instance of the iRCT class
        Returns the new dataset with the propensity_score and propensity_score_logit columns
        '''

        #Define the treatment and outcome columns
        y = self.df[[self.outcomeCol]]
        dfWithoutOutcome = self.df.drop(columns=[self.outcomeCol])
        T = dfWithoutOutcome[self.treatmentCol]

        #Define X or the dataframe for all covariates and fit to a logistical regression model
        X = dfWithoutOutcome.loc[:, dfWithoutOutcome.columns != self.treatmentCol]
        pipe = Pipeline([('scaler', StandardScaler()), ('logistic_classifier', lr())])
        pipe.fit(X, T)

        #Generate the propensity scores
        predictions = pipe.predict_proba(X)
        predictions_binary = pipe.predict(X)

        #Generate the propensity score logit
        predictions_logit = np.array([logit(xi) for xi in predictions[:,1]])

        #Add both propensity_score, propensity_score_logit, and outcome columns to dataframe 
        dfWithoutOutcome.loc[:, 'propensityScore'] = predictions[:,1]
        dfWithoutOutcome.loc[:, 'propensityScoreLogit'] = predictions_logit
        dfWithoutOutcome.loc[:, 'outcome'] = y[self.outcomeCol]

        X.loc[:, 'propensityScore'] = predictions[:,1]
        X.loc[:, 'propensityScoreLogit'] = predictions_logit
        X.loc[:, 'outcome'] = y[self.outcomeCol]
        X.loc[:, 'treatment'] = dfWithoutOutcome[self.treatmentCol]
        return dfWithoutOutcome, X

def logit(p):
    logit_value = math.log(p / (1-p))
    return logit_value


In [280]:
myiRCT = iRCT(df, treatmentCol, outcomeCol)
print(myiRCT.relationVal)
print(myiRCT.timeToComplete)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'propensityScore'] = predictions[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'propensityScoreLogit'] = predictions_logit
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'outcome'] = y[self.outcomeCol]


0.12469651283903462
22.15751051902771
