In [204]:
import pandas as pd 
import numpy as np 
import seaborn as sns
sns.set(style='white', palette='muted', color_codes=True)
import plotly.express as px 
import io 
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [205]:

from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder,StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score,confusion_matrix,silhouette_score, adjusted_rand_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA 
from sklearn.cluster import KMeans 
from sklearn.compose import make_column_selector as selector

In [206]:
data = pd.read_csv(r'C:\Data-Analysis\Provider\HHS_San_Diego.csv')

In [207]:
data.columns = data.columns.str.replace(' ','')

In [208]:
data

Unnamed: 0,ProviderName,State,City,Payment
0,BRANDON ASTIN DMD LLC,AK,ANCHOR POINT,"$113,026"
1,ELIZABETH WATNEY,AK,ANCHOR POINT,$724
2,HAPPY VALLEY ASSISTED LIVING LLC,AK,ANCHOR POINT,"$12,664"
3,A HAND UP BEHAVIOR SERVICES,AK,ANCHORAGE,"$1,191"
4,A JOINT EFFORT PHYSICAL THERAPY,AK,ANCHORAGE,"$25,480"
...,...,...,...,...
420917,"JEFFRIES DENTAL,PC",WY,WORLAND,"$25,776"
420918,UROLOGICAL SERVICES OF NORTHERN WYOMING PROFES...,WY,WORLAND,"$8,883"
420919,WORLAND PHARMACY INC,WY,WORLAND,$137
420920,WORLAND PHYSICAL THERAPY,WY,WORLAND,"$11,498"


In [209]:
san = ['CA']
new_san = data[data.State.isin(san)]
new_san

Unnamed: 0,ProviderName,State,City,Payment
18087,ARTURO CORDOVA,CA,ACTON,"$1,218"
18088,"ELITE HOME CARE, INC.",CA,ACTON,"$118,535"
18089,"JAMES A. SCHAUBEL, DDS",CA,ACTON,"$12,031"
18090,PHILIP PINTO,CA,ACTON,"$2,120"
18091,SANTIAGO PHYSICAL THERAPY INC,CA,ACTON,"$3,337"
...,...,...,...,...
70148,VALARIE CARPENTER,CA,YUCCA VALLEY,"$1,682"
70149,YANE DAVID LEVY,CA,YUCCA VALLEY,"$56,327"
70150,YOONHO CHANG DDS INC,CA,YUCCA VALLEY,"$80,913"
70151,"YUCCA FAMILY MEDICAL CARE, INC.",CA,YUCCA VALLEY,"$209,926"


In [210]:
san_cou = ['ALPINE', 'BONITA', 'BOULEVARD', 'CAMPO', 'CHULA VISTA', 'DESCANSO', 'DULZURA', 'GUATAY', 'IMPERIAL BEACH', 'JACUMBA', 'JAMUL', 'LA MESA', 'LEMON GROVE', 'LINCOLN ACRES', 'NATIONAL CITY', 'PINE VALLEY', 'POTRERO', 'SPRING VALLEY', 'TECATE', 'BONSALL', 'BORRENGO SPRINGS', 'OCOTILLO WELLS', 'CARDIFF BY THE SEA', 'CARLSBAD', 'DEL MAR', 'EL CAJON', 'ENCINITAS', 'ESCONDIDO', 'FALLBROOK', 'JULIAN', 'LA JOLLA', 'LAKESIDE', 'OCEANSIDE', 'CAMP PENDLETON', 'PALA', 'PALOMAR MOUNTAIN', 'PAUMA VALLEY', 'POWAY', 'RAMONA', 'RANCHITA', 'WARNER SPRINGS', 'RANCHO SANTA FE', 'SAN RUIS REY', 'SAN MARCOS', 'SANTA YSABEL', 'SANTEE', 'SOLANA BEACH', 'SAN MARCOS', 'VISTA', 'VALLEY CENTER', 'FALLBROOK', 'EL CAJON', 'RANCHO SANTA FE', 'SAN MARCOS', 'SAN DIEGO', 'CORONADO', 'SAN YSIDRO']

In [211]:
san_diego_relief = new_san[new_san.City.isin(san_cou)]
san_diego_relief

Unnamed: 0,ProviderName,State,City,Payment
18645,2120 ALPINE BLVD,CA,ALPINE,"$280,570"
18646,A J KLEIN MD A PROFESSIONAL CORPORATION,CA,ALPINE,"$1,401"
18647,"ALPINE PHYSICAL THERAPY & WELLNESS CENTER, INC",CA,ALPINE,"$31,770"
18648,"CYNTHIA L JACKSON, DDS, MS INC.",CA,ALPINE,"$11,724"
18649,"JONAR R. BONIFACIO, DDS, INC.",CA,ALPINE,"$36,217"
...,...,...,...,...
67966,VISTA HOSPICE CARE INC,CA,VISTA,"$151,702"
67967,VISTA PHYSICAL THERAPY AND REHAB PC,CA,VISTA,"$15,557"
67968,VISTA POST ACUTE CENTER LLC,CA,VISTA,"$1,317,998"
67969,"WENDY SHUMATE, MD",CA,VISTA,"$11,756"


In [212]:
df = san_diego_relief.reset_index(inplace=False, drop=True)

In [213]:
df['Payment'] = df['Payment'].str.replace('$', '')
df['Payment'] = df['Payment'].str.replace(',', '')

In [214]:
df['Payment'] = df['Payment'].astype('float64')

In [215]:
df

Unnamed: 0,ProviderName,State,City,Payment
0,2120 ALPINE BLVD,CA,ALPINE,280570.0
1,A J KLEIN MD A PROFESSIONAL CORPORATION,CA,ALPINE,1401.0
2,"ALPINE PHYSICAL THERAPY & WELLNESS CENTER, INC",CA,ALPINE,31770.0
3,"CYNTHIA L JACKSON, DDS, MS INC.",CA,ALPINE,11724.0
4,"JONAR R. BONIFACIO, DDS, INC.",CA,ALPINE,36217.0
...,...,...,...,...
4237,VISTA HOSPICE CARE INC,CA,VISTA,151702.0
4238,VISTA PHYSICAL THERAPY AND REHAB PC,CA,VISTA,15557.0
4239,VISTA POST ACUTE CENTER LLC,CA,VISTA,1317998.0
4240,"WENDY SHUMATE, MD",CA,VISTA,11756.0


In [216]:
df.isna().sum()

ProviderName    0
State           0
City            0
Payment         0
dtype: int64

In [217]:
df.duplicated().sum()

0

In [218]:
X = df.drop('Payment', axis=1)
y = df['Payment']

In [219]:
numeric_feat = df.select_dtypes(include= ['float64']).columns
# numeric_feat

In [220]:
cat_val = df.select_dtypes(exclude= ['float64']).columns
# cat_val

In [221]:
num_transformer = Pipeline ( 
    steps = [('scaler', StandardScaler())]
)

In [222]:
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [223]:
pre_processor = ColumnTransformer( 
    transformers = [ 
        ('num', num_transformer, numeric_feat),
        ('cat', cat_transformer, cat_val)
    ]
)

In [224]:
classi_fier = Pipeline(
    steps = [('preprocessor', pre_processor), ('classifier', LogisticRegression())]
)

In [225]:
X=df

In [226]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [227]:
classi_fier.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
print("model score: %.3f" % classi_fier.score(X_test, y_test))

model score: 0.000


In [None]:
# y_pred = LogisticRegression().predict(X_test)