# Importing packages and dataset

In [136]:
#T-test
import pandas as pd
import scipy.stats as stats

In [137]:
df=pd.read_csv('group_data.csv')

# T-test Setup

##### Isolating columns and defining them as variables

In [138]:
number_meds=df.iloc[:,12]
race_eth=df.iloc[:,0]
age=df.iloc[:,2]
LOS=df.iloc[:,7]
readmissions=df.iloc[:,22]
discharge_disposition=df.iloc[:,4]

##### Replacing strings in each column with representative variables

In [139]:
#race/ethnicity
race_eth2=race_eth.replace({
    "African American":int(1),
    "Asian":int(2),
    "Caucasian":int(3),
    "Hispanic":int(4),
    "Null":int(5),
    "Other":int(6),
})

In [140]:
#age
age2=age.replace({
    "[0-10)":int(1),
    "[10-20)":int(2),
    "[20-30)":int(3),
    "[30-40)":int(4),
    "[40-50)":int(5),
    "[50-60)":int(6),
    "[60-70)":int(7),
    "[70-80)":int(8),
    "[80-90)":int(9),
    "[90-100)":int(10)
})

In [141]:
#readmissions
readmissions2=readmissions.replace({
    ">30":int(1),
    "<30":int(2),
    "No":int(3)
})

In [142]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 23 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ethnicity/race            101766 non-null  object 
 1   sex                       101766 non-null  object 
 2   age                       101766 non-null  object 
 3   admission_type_id         101766 non-null  int64  
 4   discharge_disposition_id  101766 non-null  int64  
 5   admission_source_id       101766 non-null  int64  
 6   Unnamed: 6                0 non-null       float64
 7   LOS                       101766 non-null  int64  
 8   payer_code                101766 non-null  object 
 9   medical_specialty         101766 non-null  object 
 10  num_lab_procedures        101766 non-null  int64  
 11  num_procedures            101766 non-null  int64  
 12  num_medications           101766 non-null  int64  
 13  number_outpatient         101766 non-null  i

# Running the t-tests

##### T-tests help determine whether there is a significant difference between the groups or if the groups are similar. Through this we can interpret if the two columns are correleated. The null hypothesis in the case of this dataset is that there is no difference or correlation between the two groups. For this dataset the p-value came out to zero for all of the value comparisons, so we reject the null hypothesis, concluding that there is significant differences and no correlation for each of the variables.

### Variable 1: Number of medications

#### As compared to ethnicity/race

In [143]:
stats.ttest_ind(number_meds,race_eth2)

Ttest_indResult(statistic=518.2632769123827, pvalue=0.0)

#### As compared to age

In [144]:
stats.ttest_ind(number_meds,age2)

Ttest_indResult(statistic=343.7631978806828, pvalue=0.0)

### Variable 2: Length of stay

#### As compared to ethnicity/race

In [145]:
stats.ttest_ind(LOS,race_eth2)

Ttest_indResult(statistic=169.91304837415873, pvalue=0.0)

#### As compared to age

In [146]:
stats.ttest_ind(LOS,age2)

Ttest_indResult(statistic=-254.58923700359645, pvalue=0.0)

### Variable 3: Readmission Rates

#### As compared to ethnicity/race

In [147]:
stats.ttest_ind(readmissions2,race_eth2)

Ttest_indResult(statistic=-127.99217334121776, pvalue=0.0)

#### As compared to age

In [148]:
stats.ttest_ind(readmissions2,age2)

Ttest_indResult(statistic=-849.7343551871595, pvalue=0.0)

#### As compared to discharge disposition

In [149]:
stats.ttest_ind(readmissions2,discharge_disposition)

Ttest_indResult(statistic=-90.80574716134892, pvalue=0.0)

# Linear Regression

In [194]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression

### Variable 1: Number of medications

#### As compared to race/ethnicity

In [195]:
X=np.array([number_meds]).reshape((-1,1))
y=np.array([race_eth2]).reshape((-1,1))

In [196]:
model = LinearRegression().fit(X, y)

In [197]:
r_sq = model.score(X, y)
print('coefficient of determination:', r_sq)

coefficient of determination: 0.0005132333878352657


In [198]:
print('intercept:', model.intercept_)

intercept: [2.68222071]


In [199]:
print('slope:', model.coef_)

slope: [[0.00268579]]


In [200]:
y_pred = model.predict(X)
print('predicted response:', y_pred, sep='\n')

predicted response:
[[2.72787917]
 [2.74667972]
 [2.70639284]
 ...
 [2.70639284]
 [2.73056496]
 [2.69296388]]


#### As compared to age

In [201]:
y2=np.array([age2]).reshape((-1,1))

In [202]:
model2 = LinearRegression().fit(X, y2)

In [203]:
r_sq2 = model2.score(X, y2)
print('coefficient of determination:', r_sq2)

coefficient of determination: 0.0017511737644607672


In [204]:
print('intercept:', model2.intercept_)

intercept: [6.96520163]


In [205]:
print('slope:', model2.coef_)

slope: [[0.00820758]]


In [206]:
y2_pred = model2.predict(X)
print('predicted response:', y2_pred, sep='\n')

predicted response:
[[7.10473053]
 [7.16218361]
 [7.03906987]
 ...
 [7.03906987]
 [7.11293811]
 [6.99803196]]


### Variable 2: Length of Stay

#### As compared to race/ethnicity

In [210]:
X2=np.array([LOS]).reshape((-1,1))

In [211]:
model3 = LinearRegression().fit(X2, y)

In [212]:
r_sq3 = model3.score(X2, y)
print('coefficient of determination:', r_sq3)

coefficient of determination: 0.0003876686533235141


In [213]:
print('intercept:', model3.intercept_)

intercept: [2.75319049]


In [214]:
print('slope:', model3.coef_)

slope: [[-0.00635544]]


In [216]:
y3_pred = model3.predict(X2)
print('predicted response:', y3_pred, sep='\n')

predicted response:
[[2.72776872]
 [2.74683505]
 [2.72776872]
 ...
 [2.72776872]
 [2.74683505]
 [2.73412416]]


#### As compared to age

In [217]:
model4 = LinearRegression().fit(X2, y2)

In [218]:
r_sq4 = model4.score(X2, y2)
print('coefficient of determination:', r_sq4)

coefficient of determination: 0.011559457021110608


In [219]:
print('intercept:', model4.intercept_)

intercept: [6.84430986]


In [220]:
print('slope:', model4.coef_)

slope: [[0.05741427]]


In [221]:
y4_pred = model4.predict(X2)
print('predicted response:', y4_pred, sep='\n')

predicted response:
[[7.07396694]
 [6.90172413]
 [7.07396694]
 ...
 [7.07396694]
 [6.90172413]
 [7.01655267]]


### Variable 3: Readmissions

#### As compared to race/ethnicity

In [222]:
X3=np.array([readmissions2]).reshape((-1,1))

In [223]:
model5 = LinearRegression().fit(X3, y)

In [224]:
r_sq5 = model5.score(X3, y)
print('coefficient of determination:', r_sq5)

coefficient of determination: 0.00025507829840432095


In [225]:
print('intercept:', model5.intercept_)

intercept: [2.68875063]


In [226]:
print('slope:', model5.coef_)

slope: [[0.01666855]]


In [227]:
y5_pred = model5.predict(X3)
print('predicted response:', y5_pred, sep='\n')

predicted response:
[[2.70541918]
 [2.73875628]
 [2.70541918]
 ...
 [2.73875628]
 [2.73875628]
 [2.70541918]]


#### As compared to age

In [228]:
model6 = LinearRegression().fit(X3, y2)

In [229]:
r_sq6 = model6.score(X3, y2)
print('coefficient of determination:', r_sq6)

coefficient of determination: 0.0007190859847027165


In [230]:
print('intercept:', model6.intercept_)
print('slope:', model6.coef_)

intercept: [7.19809317]
slope: [[-0.04630067]]


In [231]:
y6_pred = model6.predict(X3)
print('predicted response:', y6_pred, sep='\n')

predicted response:
[[7.1517925 ]
 [7.05919117]
 [7.1517925 ]
 ...
 [7.05919117]
 [7.05919117]
 [7.1517925 ]]


#### As compared to discharge disposition

In [232]:
y3=np.array([discharge_disposition]).reshape((-1,1))

In [233]:
model7 = LinearRegression().fit(X3, y3)

In [234]:
r_sq7 = model6.score(X3, y3)
print('coefficient of determination:', r_sq7)

coefficient of determination: -0.41011004132757023


In [235]:
print('intercept:', model7.intercept_)
print('slope:', model7.coef_)

intercept: [3.70401577]
slope: [[0.00530907]]


In [236]:
y7_pred = model7.predict(X3)
print('predicted response:', y7_pred, sep='\n')

predicted response:
[[3.70932484]
 [3.71994297]
 [3.70932484]
 ...
 [3.71994297]
 [3.71994297]
 [3.70932484]]
