In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
  .appName('2.1. Google Cloud Storage (CSV) & Spark DataFrames') \
  .getOrCreate()

In [3]:
PartDRawData = "shreneel-bigdata1/Medicare_Part_D_Prescribers_by_Provider_and_Drug_Dataset_2016.csv"


In [4]:

from google.cloud import storage

gcs_client = storage.Client()
bucket = gcs_client.bucket('shreneel-bigdata1')

list(bucket.list_blobs(prefix='Medicare_Part_D_Prescribers_by_Provider_and_Drug_Dataset_2016.csv'))

[<Blob: shreneel-bigdata1, Medicare_Part_D_Prescribers_by_Provider_and_Drug_Dataset_2016.csv, 1683164667126860>]

In [None]:
#listing all documents in root folder

In [5]:
!hdfs dfs -ls 'gs://shreneel-bigdata1/Medicare_Part_D_Prescribers_by_Provider_and_Drug_Dataset_2016.csv'

-rwx------   3 root root 3580761897 2023-05-04 01:44 gs://shreneel-bigdata1/Medicare_Part_D_Prescribers_by_Provider_and_Drug_Dataset_2016.csv


In [None]:
#creating a spark dataframe for dataset 1

In [6]:
partD_drug_rawdata = spark \
  .read \
  .option ( "inferSchema" , "true" ) \
  .option ( "header" , "true" ) \
  .csv ( "gs://shreneel-bigdata1/Medicare_Part_D_Prescribers_by_Provider_and_Drug_Dataset_2016.csv" )

partD_drug_rawdata.printSchema()

root
 |-- Prscrbr_NPI: integer (nullable = true)
 |-- Prscrbr_Last_Org_Name: string (nullable = true)
 |-- Prscrbr_First_Name: string (nullable = true)
 |-- Prscrbr_City: string (nullable = true)
 |-- Prscrbr_State_Abrvtn: string (nullable = true)
 |-- Prscrbr_State_FIPS: string (nullable = true)
 |-- Prscrbr_Type: string (nullable = true)
 |-- Prscrbr_Type_Src: string (nullable = true)
 |-- Brnd_Name: string (nullable = true)
 |-- Gnrc_Name: string (nullable = true)
 |-- Tot_Clms: integer (nullable = true)
 |-- Tot_30day_Fills: double (nullable = true)
 |-- Tot_Day_Suply: integer (nullable = true)
 |-- Tot_Drug_Cst: double (nullable = true)
 |-- Tot_Benes: integer (nullable = true)
 |-- GE65_Sprsn_Flag: string (nullable = true)
 |-- GE65_Tot_Clms: integer (nullable = true)
 |-- GE65_Tot_30day_Fills: double (nullable = true)
 |-- GE65_Tot_Drug_Cst: double (nullable = true)
 |-- GE65_Tot_Day_Suply: integer (nullable = true)
 |-- GE65_Bene_Sprsn_Flag: string (nullable = true)
 |-- GE65_T

In [7]:
df1=partD_drug_rawdata

In [8]:
df1

DataFrame[Prscrbr_NPI: int, Prscrbr_Last_Org_Name: string, Prscrbr_First_Name: string, Prscrbr_City: string, Prscrbr_State_Abrvtn: string, Prscrbr_State_FIPS: string, Prscrbr_Type: string, Prscrbr_Type_Src: string, Brnd_Name: string, Gnrc_Name: string, Tot_Clms: int, Tot_30day_Fills: double, Tot_Day_Suply: int, Tot_Drug_Cst: double, Tot_Benes: int, GE65_Sprsn_Flag: string, GE65_Tot_Clms: int, GE65_Tot_30day_Fills: double, GE65_Tot_Drug_Cst: double, GE65_Tot_Day_Suply: int, GE65_Bene_Sprsn_Flag: string, GE65_Tot_Benes: int]

In [None]:
#Total number of rows and columns in the first dataset

In [9]:
num_rows = df1.count()
num_cols = len(df1.columns)
print("Number of rows: ", num_rows)
print("Number of columns: ", num_cols)

Number of rows:  24964300
Number of columns:  22


In [None]:
#Selecting relevant features from the dataset

In [10]:

from pyspark.sql.functions import col

partD_Drug_pd1 = df1.select(col("Prscrbr_NPI"), col("Prscrbr_City"), col("Prscrbr_State_Abrvtn"), \
                            col("Prscrbr_Last_Org_Name"), col("Prscrbr_First_Name"), \
                            col("Prscrbr_Type"), col("Brnd_Name"), col("Gnrc_Name"), \
                            col("Tot_Drug_Cst"), col("Tot_Clms"), col("Tot_Day_Suply"))

In [11]:
partD_pd1 = partD_Drug_pd1

In [12]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType



# Select the required columns
partD_Drug_df = partD_pd1.select(col('Prscrbr_NPI'), col('Brnd_Name'), col('Tot_Drug_Cst'), col('Tot_Clms'), col('Tot_Day_Suply'), col('Prscrbr_Type'))

# Cast the 'npi' column to 'StringType'
partD_Drug_df = partD_Drug_df.withColumn('Prscrbr_NPI', col('Prscrbr_NPI').cast(StringType()))

# Show the resulting DataFrame
partD_Drug_df.show()


+-----------+--------------------+------------+--------+-------------+-----------------+
|Prscrbr_NPI|           Brnd_Name|Tot_Drug_Cst|Tot_Clms|Tot_Day_Suply|     Prscrbr_Type|
+-----------+--------------------+------------+--------+-------------+-----------------+
| 1003000126|Atorvastatin Calcium|      139.32|      13|          450|Internal Medicine|
| 1003000126|   Ciprofloxacin Hcl|       80.99|      11|           96|Internal Medicine|
| 1003000126| Doxycycline Hyclate|      586.12|      20|          199|Internal Medicine|
| 1003000126|             Eliquis|     6065.02|      17|          510|Internal Medicine|
| 1003000126|          Furosemide|       45.76|      17|          405|Internal Medicine|
| 1003000126|     Hydralazine Hcl|      169.48|      16|          420|Internal Medicine|
| 1003000126|Isosorbide Mononi...|      372.63|      33|         1005|Internal Medicine|
| 1003000126|        Levofloxacin|      222.41|      26|          159|Internal Medicine|
| 1003000126|        

In [13]:
# Select the required columns
partD_Spec_pd1 = partD_pd1.select(col('Prscrbr_NPI'), col('Prscrbr_Type'))
# Show the resulting DataFrame
partD_Spec_pd1.show()

+-----------+-----------------+
|Prscrbr_NPI|     Prscrbr_Type|
+-----------+-----------------+
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000126|Internal Medicine|
| 1003000142|   Anesthesiology|
| 1003000142|   Anesthesiology|
| 1003000142|   Anesthesiology|
| 1003000142|   Anesthesiology|
| 1003000142|   Anesthesiology|
+-----------+-----------------+
only showing top 20 rows



In [14]:
partD_Drug_df.head()

Row(Prscrbr_NPI='1003000126', Brnd_Name='Atorvastatin Calcium', Tot_Drug_Cst=139.32, Tot_Clms=13, Tot_Day_Suply=450, Prscrbr_Type='Internal Medicine')

In [15]:

# Select the required columns
partD_pd0= partD_pd1.select(col('Prscrbr_NPI'), col('Prscrbr_City'), col('Prscrbr_State_Abrvtn'), \
                               col('Prscrbr_Last_Org_Name'), col('Prscrbr_First_Name'), \
                               col('Prscrbr_Type'))

# Show the resulting DataFrame
partD_pd0.show()


+-----------+------------+--------------------+---------------------+------------------+-----------------+
|Prscrbr_NPI|Prscrbr_City|Prscrbr_State_Abrvtn|Prscrbr_Last_Org_Name|Prscrbr_First_Name|     Prscrbr_Type|
+-----------+------------+--------------------+---------------------+------------------+-----------------+
| 1003000126|  Cumberland|                  MD|            Enkeshafi|           Ardalan|Internal Medicine|
| 1003000126|  Cumberland|                  MD|            Enkeshafi|           Ardalan|Internal Medicine|
| 1003000126|  Cumberland|                  MD|            Enkeshafi|           Ardalan|Internal Medicine|
| 1003000126|  Cumberland|                  MD|            Enkeshafi|           Ardalan|Internal Medicine|
| 1003000126|  Cumberland|                  MD|            Enkeshafi|           Ardalan|Internal Medicine|
| 1003000126|  Cumberland|                  MD|            Enkeshafi|           Ardalan|Internal Medicine|
| 1003000126|  Cumberland|           

In [None]:
#Remove duplicate records

In [16]:
partD_catfpd = partD_pd0.drop_duplicates()

In [17]:
partD_catfpd.head()

Row(Prscrbr_NPI=1003012014, Prscrbr_City='Providence', Prscrbr_State_Abrvtn='RI', Prscrbr_Last_Org_Name='Scarfo', Prscrbr_First_Name='Keith-Austin', Prscrbr_Type='Anesthesiology')

In [18]:

# Define the rename dictionary
rename_dict = {'Prscrbr_First_Name':'first_name', 'Prscrbr_Last_Org_Name':'last_name', \
               'Prscrbr_City':'city', 'Prscrbr_State_Abrvtn':'state', 'Prscrbr_Type':'Speciality'}



# Rename the columns
for old_col, new_col in rename_dict.items():
    partD_catfpd = partD_catfpd.withColumnRenamed(old_col, new_col)

# Show the resulting DataFrame
partD_catfpd.show()

+-----------+----------------+-----+-------------------+------------+--------------------+
|Prscrbr_NPI|            city|state|          last_name|  first_name|          Speciality|
+-----------+----------------+-----+-------------------+------------+--------------------+
| 1003012014|      Providence|   RI|             Scarfo|Keith-Austin|      Anesthesiology|
| 1003015785|         Houston|   TX|             Rogers|        Ryan|       Ophthalmology|
| 1003019902|       Nashville|   TN|             Acosta|     Lealani|           Neurology|
| 1003051715|      Logansport|   IN|          Abi Fadel|    Francois|   Pulmonary Disease|
| 1003055781|        Hartford|   CT|               Rice|       Jenny| Physician Assistant|
| 1003068628|       Evergreen|   CO|              Smith|       Katie| Physician Assistant|
| 1003078254|    Wilkes Barre|   PA|          Rutkowski|        Paul|   Internal Medicine|
| 1003081142|          Oxnard|   CA|             Murphy|        Neal|             Dentist|

In [19]:
partD_catfpd.count()

893160

In [20]:
from pyspark.sql.functions import sum, mean, max

partD_agg=partD_Drug_df.groupBy('Prscrbr_NPI').agg(sum('Tot_Drug_Cst').alias('sum_tot_drug_cst'),
                              mean('Tot_Drug_Cst').alias('avg_tot_drug_cst'),
                              max('Tot_Drug_Cst').alias('max_tot_drug_cst'),
                              sum('Tot_Clms').alias('sum_tot_clms'),
                              mean('Tot_Clms').alias('avg_tot_clms'),
                              max('Tot_Clms').alias('max_tot_clms'),
                              sum('Tot_Day_Suply').alias('sum_tot_day_suply'),
                              mean('Tot_Day_Suply').alias('avg_tot_day_suply'),
                              max('Tot_Day_Suply').alias('max_tot_day_suply'))


In [21]:
partD_agg.show()

+-----------+------------------+------------------+----------------+------------+------------------+------------+-----------------+------------------+-----------------+
|Prscrbr_NPI|  sum_tot_drug_cst|  avg_tot_drug_cst|max_tot_drug_cst|sum_tot_clms|      avg_tot_clms|max_tot_clms|sum_tot_day_suply| avg_tot_day_suply|max_tot_day_suply|
+-----------+------------------+------------------+----------------+------------+------------------+------------+-----------------+------------------+-----------------+
| 1033388616|         418520.54|        16740.8216|       105879.23|        1135|              45.4|         217|            44265|            1770.6|             9639|
| 1033392337|            703.09|          175.7725|          298.34|         119|             29.75|          62|              747|            186.75|              439|
| 1033394812|            701.25|           116.875|          204.95|         149|24.833333333333332|          56|              580| 96.66666666666667|     

In [22]:
from pyspark.sql.functions import col

partD_allpd = partD_agg.join(partD_catfpd, on='Prscrbr_NPI', how='left')



In [23]:
partD_allpd.show()

+-----------+------------------+------------------+----------------+------------+------------------+------------+-----------------+------------------+-----------------+------------+-----+-------------+----------+-------------------+
|Prscrbr_NPI|  sum_tot_drug_cst|  avg_tot_drug_cst|max_tot_drug_cst|sum_tot_clms|      avg_tot_clms|max_tot_clms|sum_tot_day_suply| avg_tot_day_suply|max_tot_day_suply|        city|state|    last_name|first_name|         Speciality|
+-----------+------------------+------------------+----------------+------------+------------------+------------+-----------------+------------------+-----------------+------------+-----+-------------+----------+-------------------+
| 1003043209|          39676.76|1202.3260606060608|         9329.51|        1170| 35.45454545454545|         109|            44482| 1347.939393939394|             5262|        Lynn|   MA|        Affel|  Marjorie|    Family Practice|
| 1003072810|214723.17999999996|2618.5753658536582|        21064.12|

In [24]:
partD_allpd.head()

Row(Prscrbr_NPI='1003043209', sum_tot_drug_cst=39676.76, avg_tot_drug_cst=1202.3260606060608, max_tot_drug_cst=9329.51, sum_tot_clms=1170, avg_tot_clms=35.45454545454545, max_tot_clms=109, sum_tot_day_suply=44482, avg_tot_day_suply=1347.939393939394, max_tot_day_suply=5262, city='Lynn', state='MA', last_name='Affel', first_name='Marjorie', Speciality='Family Practice')

In [None]:
#creating a spark dataframe for dataset 2

In [25]:
gcs_client = storage.Client()
bucket = gcs_client.bucket('shreneel-bigdata1')

list(bucket.list_blobs(prefix='OP_DTL_GNRL_PGYR2015_P01202023.csv'))
!hdfs dfs -ls 'gs://shreneel-bigdata1/OP_DTL_GNRL_PGYR2015_P01202023.csv'
payment_rawdata = spark \
  .read \
  .option ( "inferSchema" , "true" ) \
  .option ( "header" , "true" ) \
  .csv ( "gs://shreneel-bigdata1/OP_DTL_GNRL_PGYR2015_P01202023.csv" )

payment_rawdata.printSchema()

-rwx------   3 root root 6409783272 2023-05-04 01:50 gs://shreneel-bigdata1/OP_DTL_GNRL_PGYR2015_P01202023.csv
root
 |-- Change_Type: string (nullable = true)
 |-- Covered_Recipient_Type: string (nullable = true)
 |-- Teaching_Hospital_CCN: integer (nullable = true)
 |-- Teaching_Hospital_ID: integer (nullable = true)
 |-- Teaching_Hospital_Name: string (nullable = true)
 |-- Physician_Profile_ID: integer (nullable = true)
 |-- Physician_NPI: integer (nullable = true)
 |-- Physician_First_Name: string (nullable = true)
 |-- Physician_Middle_Name: string (nullable = true)
 |-- Physician_Last_Name: string (nullable = true)
 |-- Physician_Name_Suffix: string (nullable = true)
 |-- Recipient_Primary_Business_Street_Address_Line1: string (nullable = true)
 |-- Recipient_Primary_Business_Street_Address_Line2: string (nullable = true)
 |-- Recipient_City: string (nullable = true)
 |-- Recipient_State: string (nullable = true)
 |-- Recipient_Zip_Code: string (nullable = true)
 |-- Recipient_Co

In [26]:
from pyspark.sql.functions import col

payment_fpd = payment_rawdata.select(col('Physician_First_Name'),
                                 col('Physician_Last_Name'),
                                 col('Recipient_City'),
                                 col('Recipient_State'),
                                 col('Total_Amount_of_Payment_USDollars'))


In [27]:
payment_fpd.head()

Row(Physician_First_Name='DAVID', Physician_Last_Name='GORDLEY', Recipient_City='SLIPPERY ROCK', Recipient_State='PA', Total_Amount_of_Payment_USDollars='60.00')

In [28]:
payment_fpd.count()

11572091

In [29]:
from pyspark.sql.functions import sum

payment_fpd1 = payment_fpd.groupby(['Physician_First_Name', 'Physician_Last_Name', 'Recipient_City', 'Recipient_State']) \
                         .agg(sum('Total_Amount_of_Payment_USDollars').alias('Total_Amount_of_Payment_USDollars_sum'))

In [30]:
payment_fpd1.show()

+--------------------+-------------------+-----------------+---------------+-------------------------------------+
|Physician_First_Name|Physician_Last_Name|   Recipient_City|Recipient_State|Total_Amount_of_Payment_USDollars_sum|
+--------------------+-------------------+-----------------+---------------+-------------------------------------+
|               SCOTT|               RAND|          HOUSTON|             TX|                               295.88|
|               SHAWN|           BRUBAKER|          REDDING|             CA|                   21901.230000000003|
|               SHYAM|        BRAHMABHATT|     WILLOW GROVE|             PA|                               148.49|
|              STACEY|             MARTIN|          HOUSTON|             TX|                              9793.95|
|             STEPHEN|           KOWALSKI|       BIRMINGHAM|             AL|                               771.17|
|              SZYMON|         ROSENBLATT|ELK GROVE VILLAGE|             IL|    

In [31]:
from pyspark.sql.functions import col

rename_dict = {'Physician_First_Name':'first_name', 'Physician_Last_Name':'last_name', 'Recipient_City':'city', 'Recipient_State':'state', 'Total_Amount_of_Payment_USDollars_sum':'Total_Payment_Sum'}

payment_fpd1 = payment_fpd1.select([col(c).alias(rename_dict.get(c, c)) for c in payment_fpd1.columns])


In [32]:
payment_fpd1.show()

+----------+---------+-------------+-----+------------------+
|first_name|last_name|         city|state| Total_Payment_Sum|
+----------+---------+-------------+-----+------------------+
|    Edward|  Alquero|      Waipahu|   HI|428.01000000000005|
|  Jonathan| Eckstein|Valley Stream|   NY|153.45000000000002|
|   Douglas|  Corazza|    Princeton|   NJ|            324.19|
|     David|    White|   Whiteville|   NC|            131.12|
|   Krystal|  Ainsley| Williamsburg|   VA|23.630000000000003|
|    Willis|    Godin|  Cherry Hill|   NJ|105.49000000000001|
|      Anna|Naumovich|     St Louis|   MO|1348.5900000000001|
|     David| Portugal|      Houston|   TX|            141.07|
|   Mangala|   Shetty|        Ocala|   FL|             191.5|
|     Diego|     Diaz|     New York|   NY|            380.01|
|    Joseph| Valaitis|     Rockford|   IL|              77.4|
|    Sharon|   Packer|     New York|   NY|              25.0|
| Fionnuala|    Kelly|     Hinsdale|   IL|            106.02|
|     My

In [33]:
from pyspark.sql.functions import desc

payment_fpd2 = payment_fpd1.sort(desc('Total_Payment_Sum'))

In [34]:
payment_fpd2.show()

+----------+---------+-----------------+-----+--------------------+
|first_name|last_name|             city|state|   Total_Payment_Sum|
+----------+---------+-----------------+-----+--------------------+
|      null|     null|           DUARTE|   CA|      3.0654182569E8|
|      null|     null|           BOSTON|   MA|       4.200212036E7|
|     ROGER|  JACKSON|NORTH KANSAS CITY|   MO|       3.450708545E7|
|      null|     null|           Boston|   MA|2.0766683030000005E7|
|   STEPHEN| BURKHART|      SAN ANTONIO|   TX|1.9421951320000004E7|
|      null|     null|        Rochester|   NY|        1.93059828E7|
|     KEVIN|    FOLEY|          Memphis|   TN| 1.782763144999999E7|
|      null|     null|        Cleveland|   OH|1.4486272639999999E7|
|      YVES|    GOBIN|         New York|   NY|1.2962521479999999E7|
|      null|     null|            PHILA|   PA|1.1572908839999998E7|
|    RODNEY|    RAABE|          Spokane|   WA|1.0414841879999999E7|
|      null|     null|      Little Rock|   AR|1.

In [None]:
# Joining the 

In [35]:
pay_partD_fpd = partD_allpd.join(payment_fpd2, ['last_name', 'first_name', 'city', 'state'], how='left')


In [36]:
pay_partD_fpd.show()

+---------+----------+----------------+-----+-----------+------------------+------------------+----------------+------------+------------------+------------+-----------------+------------------+-----------------+--------------------+------------------+
|last_name|first_name|            city|state|Prscrbr_NPI|  sum_tot_drug_cst|  avg_tot_drug_cst|max_tot_drug_cst|sum_tot_clms|      avg_tot_clms|max_tot_clms|sum_tot_day_suply| avg_tot_day_suply|max_tot_day_suply|          Speciality| Total_Payment_Sum|
+---------+----------+----------------+-----+-----------+------------------+------------------+----------------+------------+------------------+------------+-----------------+------------------+-----------------+--------------------+------------------+
|   Abbott|     Laura|   San Francisco|   CA| 1093071367|          16670.63| 5556.876666666667|         9978.51|          65|21.666666666666668|          35|             6923|2307.6666666666665|             5003|Student in an Org...|        

In [None]:
#Load dataset 3 in pyspark dataframe
#This database contains a rundown of people and substances that are prohibited from taking an interest in governmentally financed social insurance programs (for example Medicare) because of past medicinal services extortion. 

In [37]:
gcs_client = storage.Client()
bucket = gcs_client.bucket('shreneel-bigdata1')

list(bucket.list_blobs(prefix='UPDATED.csv'))
!hdfs dfs -ls 'gs://shreneel-bigdata1/UPDATED.csv'
IELE_rawdata = spark \
  .read \
  .option ( "inferSchema" , "true" ) \
  .option ( "header" , "true" ) \
  .csv ( "gs://shreneel-bigdata1/UPDATED.csv" )

IELE_rawdata.printSchema()  

-rwx------   3 root root   13969086 2023-05-04 01:25 gs://shreneel-bigdata1/UPDATED.csv
root
 |-- LASTNAME: string (nullable = true)
 |-- FIRSTNAME: string (nullable = true)
 |-- MIDNAME: string (nullable = true)
 |-- BUSNAME: string (nullable = true)
 |-- GENERAL: string (nullable = true)
 |-- SPECIALTY: string (nullable = true)
 |-- UPIN: string (nullable = true)
 |-- NPI: integer (nullable = true)
 |-- DOB: integer (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- ZIP: string (nullable = true)
 |-- EXCLTYPE: string (nullable = true)
 |-- EXCLDATE: string (nullable = true)
 |-- REINDATE: integer (nullable = true)
 |-- WAIVERDATE: integer (nullable = true)
 |-- WVRSTATE: string (nullable = true)



In [38]:
npifraud_pd0 = IELE_rawdata.select('NPI', 'EXCLTYPE')
npifraud_pd0.show()

+----------+--------+
|       NPI|EXCLTYPE|
+----------+--------+
|         0|  1128a1|
|1972902351|  1128b8|
|         0|  1128a1|
|         0|  1128b7|
|1922348218|  1128a1|
|         0|  1128b5|
|         0|  1128a1|
|         0|  1128b8|
|         0|  1128a1|
|         0|  1128b8|
|         0|  1128b5|
|         0|  1128a1|
|         0|  1128b8|
|         0|  1128a1|
|         0|  1128a1|
|         0|  1128b4|
|         0|  1128a1|
|         0|  1128b8|
|         0|  1128a1|
|         0|  1128a1|
+----------+--------+
only showing top 20 rows



In [39]:
from pyspark.sql.functions import col


npifraud_pd1 = npifraud_pd0.filter(col('NPI') != 0)


In [40]:
npifraud_pd1.show()

+----------+---------+
|       NPI| EXCLTYPE|
+----------+---------+
|1972902351|   1128b8|
|1922348218|   1128a1|
|1942476080|   1128b8|
|1275600959|   1128a1|
|1891731758|   1128b8|
|1265830335|   1128a1|
|1851631543|   1128b7|
|1902198435|   1128a1|
|1073916631|   1128b7|
|1073682936|1128b7   |
|1902166028|   1128b8|
|1992906937|   1128b8|
|1104947944|   1128a1|
|1164669479|   1128a1|
|1043302250|   1128a1|
|1801231436|1128a1   |
|1912011800|   1128b8|
|1780812768|   1128b7|
|1447560867|   1128b8|
|1790963460|   1128b7|
+----------+---------+
only showing top 20 rows



In [41]:
rename_dict = {'NPI':'Prscrbr_NPI', 'EXCLTYPE':'is_fraud'}

npi_fraud_pd = npifraud_pd1.select([col(c).alias(rename_dict.get(c, c)) for c in npifraud_pd1.columns])


In [42]:
npi_fraud_pd.show()

+-----------+---------+
|Prscrbr_NPI| is_fraud|
+-----------+---------+
| 1972902351|   1128b8|
| 1922348218|   1128a1|
| 1942476080|   1128b8|
| 1275600959|   1128a1|
| 1891731758|   1128b8|
| 1265830335|   1128a1|
| 1851631543|   1128b7|
| 1902198435|   1128a1|
| 1073916631|   1128b7|
| 1073682936|1128b7   |
| 1902166028|   1128b8|
| 1992906937|   1128b8|
| 1104947944|   1128a1|
| 1164669479|   1128a1|
| 1043302250|   1128a1|
| 1801231436|1128a1   |
| 1912011800|   1128b8|
| 1780812768|   1128b7|
| 1447560867|   1128b8|
| 1790963460|   1128b7|
+-----------+---------+
only showing top 20 rows



In [43]:
from pyspark.sql.functions import lit

npi_fraud_pd = npi_fraud_pd.withColumn('is_fraud', lit(1))


In [44]:
npi_fraud_pd.show()

+-----------+--------+
|Prscrbr_NPI|is_fraud|
+-----------+--------+
| 1972902351|       1|
| 1922348218|       1|
| 1942476080|       1|
| 1275600959|       1|
| 1891731758|       1|
| 1265830335|       1|
| 1851631543|       1|
| 1902198435|       1|
| 1073916631|       1|
| 1073682936|       1|
| 1902166028|       1|
| 1992906937|       1|
| 1104947944|       1|
| 1164669479|       1|
| 1043302250|       1|
| 1801231436|       1|
| 1912011800|       1|
| 1780812768|       1|
| 1447560867|       1|
| 1790963460|       1|
+-----------+--------+
only showing top 20 rows



In [45]:
print(npi_fraud_pd.dtypes)


[('Prscrbr_NPI', 'int'), ('is_fraud', 'int')]


In [46]:
Features_pd1 = pay_partD_fpd.join(npi_fraud_pd, ['Prscrbr_NPI'], how='left')
Features_pd1.show()

+-----------+---------+----------+----------------+-----+------------------+------------------+----------------+------------+------------------+------------+-----------------+------------------+-----------------+--------------------+------------------+--------+
|Prscrbr_NPI|last_name|first_name|            city|state|  sum_tot_drug_cst|  avg_tot_drug_cst|max_tot_drug_cst|sum_tot_clms|      avg_tot_clms|max_tot_clms|sum_tot_day_suply| avg_tot_day_suply|max_tot_day_suply|          Speciality| Total_Payment_Sum|is_fraud|
+-----------+---------+----------+----------------+-----+------------------+------------------+----------------+------------+------------------+------------+-----------------+------------------+-----------------+--------------------+------------------+--------+
| 1093071367|   Abbott|     Laura|   San Francisco|   CA|          16670.63| 5556.876666666667|         9978.51|          65|21.666666666666668|          35|             6923|2307.6666666666665|             5003|St

In [47]:
Features_pd1.describe().show()

+-------+-------------------+-----------+----------+------+------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+-----------------+--------+
|summary|        Prscrbr_NPI|  last_name|first_name|  city| state|  sum_tot_drug_cst|  avg_tot_drug_cst|  max_tot_drug_cst|      sum_tot_clms|      avg_tot_clms|      max_tot_clms| sum_tot_day_suply| avg_tot_day_suply| max_tot_day_suply|      Speciality|Total_Payment_Sum|is_fraud|
+-------+-------------------+-----------+----------+------+------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----------------+-----------------+--------+
|  count|             893173|     893173|    893165|893173|893173|            893173|            893173|            893173|            893173|            

In [None]:
#Filling all the na values with 0

In [48]:
Features_pd1 = Features_pd1.fillna(0)

In [49]:
Features_pd1

DataFrame[Prscrbr_NPI: string, last_name: string, first_name: string, city: string, state: string, sum_tot_drug_cst: double, avg_tot_drug_cst: double, max_tot_drug_cst: double, sum_tot_clms: bigint, avg_tot_clms: double, max_tot_clms: int, sum_tot_day_suply: bigint, avg_tot_day_suply: double, max_tot_day_suply: int, Speciality: string, Total_Payment_Sum: double, is_fraud: int]

In [None]:
from pyspark.sql.functions import col

fraud_count = Features_pd1.filter(col('is_fraud') == 1).count()


In [None]:
fraud_count

In [None]:
FeaturesAll = Features_pd1

In [None]:
from pyspark.sql.functions import log10, col

# Scaling the features
FeaturesAll = FeaturesAll.withColumn('sum_tot_drug_cst', log10(col('sum_tot_drug_cst') + 1.0))
FeaturesAll = FeaturesAll.withColumn('sum_tot_clms', log10(col('sum_tot_clms') + 1.0))
FeaturesAll = FeaturesAll.withColumn('sum_tot_day_suply', log10(col('sum_tot_day_suply') + 1.0))
FeaturesAll = FeaturesAll.withColumn('Total_Payment_Sum', log10(col('Total_Payment_Sum') + 1.0))

FeaturesAll = FeaturesAll.withColumn('avg_tot_drug_cst', log10(col('avg_tot_drug_cst') + 1.0))
FeaturesAll = FeaturesAll.withColumn('avg_tot_clms', log10(col('avg_tot_clms') + 1.0))
FeaturesAll = FeaturesAll.withColumn('avg_tot_day_suply', log10(col('avg_tot_day_suply') + 1.0))

FeaturesAll = FeaturesAll.withColumn('max_tot_drug_cst', log10(col('max_tot_drug_cst') + 1.0))
FeaturesAll = FeaturesAll.withColumn('max_tot_clms', log10(col('max_tot_clms') + 1.0))
FeaturesAll = FeaturesAll.withColumn('max_tot_day_suply', log10(col('max_tot_day_suply') + 1.0))

FeaturesAll = FeaturesAll.withColumn('claim_max-mean', col('max_tot_clms') - col('avg_tot_clms'))
FeaturesAll = FeaturesAll.withColumn('supply_max-mean', col('max_tot_day_suply') - col('avg_tot_day_suply'))
FeaturesAll = FeaturesAll.withColumn('drug_max-mean', col('max_tot_drug_cst') - col('avg_tot_drug_cst'))



In [None]:
FeaturesAll

In [None]:
from pyspark.sql.functions import col

FeaturesAll = FeaturesAll.withColumn("Prscrbr_NPI", col("Prscrbr_NPI").cast("string"))


In [None]:
from pyspark.sql.types import StringType

categorical_features = ['Prscrbr_NPI', 'last_name', 'Speciality', 'first_name', 'city', 'state']

for feature in categorical_features:
    FeaturesAll = FeaturesAll.withColumn(feature, FeaturesAll[feature].cast(StringType()))


In [None]:
numerical_features = ['sum_tot_drug_cst', 'avg_tot_drug_cst','Total_Payment_Sum',
       'max_tot_drug_cst', 'sum_tot_clms',
       'avg_tot_clms', 'max_tot_clms',
       'sum_tot_day_suply', 'avg_tot_day_suply', 'max_tot_day_suply',
    'claim_max-mean','supply_max-mean', 'drug_max-mean']

In [None]:
target = ['is_fraud']


In [None]:
allvars = categorical_features + numerical_features + target


In [None]:
y = FeaturesAll.select("is_fraud").rdd.flatMap(lambda x: x).collect()
X = FeaturesAll.select([col(c) for c in allvars if c != 'is_fraud'])


In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType
from pyspark.ml.tuning import TrainValidationSplit

# select the numerical columns from the original dataframe
numerical_features = ['sum_tot_drug_cst', 'avg_tot_drug_cst','Total_Payment_Sum',
       'max_tot_drug_cst', 'sum_tot_clms',
       'avg_tot_clms', 'max_tot_clms',
       'sum_tot_day_suply', 'avg_tot_day_suply', 'max_tot_day_suply',
    'claim_max-mean','supply_max-mean', 'drug_max-mean']
X = FeaturesAll.select(numerical_features)

# convert numerical columns to double type
for feature in numerical_features:
    X = X.withColumn(feature, col(feature).cast(DoubleType()))

# combine features into a single vector column
vectorAssembler = VectorAssembler(inputCols=X.columns, outputCol="features_vec")
X = vectorAssembler.transform(X)

# split the data into train and validation sets
train, test = X.randomSplit([0.8, 0.2], seed=0)

# select the correct columns for input and output
X_train = train.select(X.columns)
X_valid = test.select(X.columns)
y_train = train.select("Total_Payment_Sum")
y_valid = test.select("Total_Payment_Sum")

print(X_train.count(), len(X_train.columns))
print(X_valid.count(), len(X_valid.columns))


In [None]:
from pyspark.sql.functions import col

# fill null values in numerical columns with 0
for feature in numerical_features:
    X_train = X_train.withColumn(feature, col(feature).cast("double"))
    X_valid = X_valid.withColumn(feature, col(feature).cast("double"))
    X_train = X_train.na.fill(0, [feature])
    X_valid = X_valid.na.fill(0, [feature])

# fill null values in categorical columns with 'NA'
for feature in categorical_features:
    if feature in X_train.columns:
        X_train = X_train.na.fill('NA', [feature])
    if feature in X_valid.columns:
        X_valid = X_valid.na.fill('NA', [feature])



In [None]:
from pyspark.sql.functions import col

X_train.select([col(col_name).cast("double").alias(col_name) for col_name in numerical_features]).dtypes



In [None]:

from pyspark.sql.functions import rand

df_len = FeaturesAll.count()
train_len = int(df_len * 0.8)

df_train = FeaturesAll.orderBy(rand()).limit(train_len)
df_valid = FeaturesAll.orderBy(rand()).exceptAll(df_train)

print(df_train.count())
print(df_valid.count())


In [None]:
df_train.printSchema()


In [None]:
from pyspark.sql.functions import col

partD_drug_train = partD_Drug_df.join(df_train.select('Prscrbr_NPI', 'is_fraud'), on=['Prscrbr_NPI'], how='inner')
partD_drug_all = partD_Drug_df.join(FeaturesAll.select('Prscrbr_NPI', 'is_fraud'), on=['Prscrbr_NPI'], how='inner')


In [None]:
print(partD_drug_train.filter(col("is_fraud") == 1).count())


In [None]:
print(partD_drug_train.filter(col("Brnd_name").isNotNull()).select("Brnd_name").distinct().count())




In [None]:
drugs = set(partD_drug_train.filter(col("Brnd_name").isNotNull()).select("Brnd_name").distinct().rdd.flatMap(lambda x: x).collect())
print(len(drugs))


In [None]:
# Total records in train set
print("Total records in train set : ")
print(partD_drug_train.count())

# Total Fraud in train set
print("Total Fraud in train set : ")
print(partD_drug_train.filter("is_fraud == 1").count())

# Show DataFrame
partD_drug_train.show()

In [None]:
# Define the feature columns to be used in the logistic regression model
feature_cols = ['Tot_Drug_Cst', 'Tot_Clms', 'Tot_Day_Suply']

# Create a vector assembler to assemble the features into a vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [None]:
# Define the feature columns to be used in the logistic regression model
feature_cols = ['Tot_Drug_Cst', 'Tot_Clms', 'Tot_Day_Suply']

# Create a vector assembler to assemble the features into a vector
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Apply the vector assembler to the training data
train_data = assembler.transform(partD_drug_train).select("features", "is_fraud")

# Split the data into training and test sets
train_set, test_set = train_data.randomSplit([0.7, 0.3], seed=12345)

In [None]:
num_train_data = train_set.count()
print("Number of data in train_set:", num_train_data)


In [None]:
################################################

In [73]:
 import time


In [78]:
# Create a logistic regression model with default parameters
lr = LogisticRegression(featuresCol='features', labelCol='is_fraud')

# Train the model using the training set
start = time.time()
lr_model = lr.fit(train_set)
end = time.time()
print(f"Time to train logistic regression model: {end - start:.4f} seconds")

# Make predictions on the test set
start = time.time()
predictions = lr_model.transform(test_set)
end = time.time()
print(f"Time to make predictions on test set: {end - start:.4f} seconds")

# Evaluate the model using binary classification metrics
binary_evaluator = BinaryClassificationEvaluator(labelCol='is_fraud')
accuracy = binary_evaluator.evaluate(predictions)

# Evaluate the model using F1 score
multi_evaluator = MulticlassClassificationEvaluator(labelCol='is_fraud', metricName='f1')
f1_score = multi_evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1_score:.4f}")


Time to train logistic regression model: 185.1697 seconds
Time to make predictions on test set: 0.0242 seconds
Accuracy: 0.5947
F1 Score: 0.9965


In [None]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create a Naive Bayes model with default parameters
nb = NaiveBayes(featuresCol='features', labelCol='is_fraud')

# Train the model using the training set
start = time.time()
nb_model = nb.fit(train_set)
end = time.time()
print(f"Time to train Naive Bayes model: {end - start:.4f} seconds")

# Make predictions on the test set
start = time.time()
predictions = nb_model.transform(test_set)
end = time.time()
print(f"Time to make predictions on test set: {end - start:.4f} seconds")

# Evaluate the model using binary classification metrics
binary_evaluator = BinaryClassificationEvaluator(labelCol='is_fraud')
accuracy = binary_evaluator.evaluate(predictions)

# Evaluate the model using F1 score
multi_evaluator = MulticlassClassificationEvaluator(labelCol='is_fraud', metricName='f1')
f1_score = multi_evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1_score:.4f}")


Time to train Naive Bayes model: 164.7756 seconds
Time to make predictions on test set: 0.0217 seconds
Accuracy: 0.4857
F1 Score: 0.8721


In [None]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Create a GBTClassifier model with default parameters
gbt = GBTClassifier(featuresCol='features', labelCol='is_fraud')

# Train the model using the training set
start = time.time()
gbt_model = gbt.fit(train_set)
end = time.time()
print(f"Time to train GBTClassifier model: {end - start:.4f} seconds")

# Make predictions on the test set
start = time.time()
predictions = gbt_model.transform(test_set)
end = time.time()
print(f"Time to make predictions on test set: {end - start:.4f} seconds")

# Evaluate the model using binary classification metrics
binary_evaluator = BinaryClassificationEvaluator(labelCol='is_fraud')
accuracy = binary_evaluator.evaluate(predictions)

# Evaluate the model using F1 score
multi_evaluator = MulticlassClassificationEvaluator(labelCol='is_fraud', metricName='f1')
f1_score = multi_evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1_score:.4f}")


In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Create a RandomForestClassifier model with default parameters
rf = RandomForestClassifier(featuresCol='features', labelCol='is_fraud')

# Train the model using the training set
start = time.time()
rf_model = rf.fit(train_set)
end = time.time()
print(f"Time to train RandomForestClassifier model: {end - start:.4f} seconds")

# Make predictions on the test set
start = time.time()
predictions = rf_model.transform(test_set)
end = time.time()
print(f"Time to make predictions on test set: {end - start:.4f} seconds")

# Evaluate the model using binary classification metrics
binary_evaluator = BinaryClassificationEvaluator(labelCol='is_fraud')
accuracy = binary_evaluator.evaluate(predictions)

# Evaluate the model using F1 score
multi_evaluator = MulticlassClassificationEvaluator(labelCol='is_fraud', metricName='f1')
f1_score = multi_evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1_score:.4f}")


Time to train RandomForestClassifier model: 403.8538 seconds
Time to make predictions on test set: 0.0342 seconds


KeyboardInterrupt: 

In [76]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Create a DecisionTreeClassifier model with default parameters
dt = DecisionTreeClassifier(featuresCol='features', labelCol='is_fraud')

# Train the model using the training set
start = time.time()
dt_model = dt.fit(train_set)
end = time.time()
print(f"Time to train DecisionTreeClassifier model: {end - start:.4f} seconds")

# Make predictions on the test set
start = time.time()
predictions = dt_model.transform(test_set)
end = time.time()
print(f"Time to make predictions on test set: {end - start:.4f} seconds")

# Evaluate the model using binary classification metrics
binary_evaluator = BinaryClassificationEvaluator(labelCol='is_fraud')
accuracy = binary_evaluator.evaluate(predictions)

# Evaluate the model using F1 score
multi_evaluator = MulticlassClassificationEvaluator(labelCol='is_fraud', metricName='f1')
f1_score = multi_evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1_score:.4f}")


KeyboardInterrupt: 

Exception ignored in: <module 'threading' from '/opt/conda/anaconda/lib/python3.7/threading.py'>
Traceback (most recent call last):
  File "/opt/conda/anaconda/lib/python3.7/threading.py", line 1308, in _shutdown
    lock.acquire()
  File "/usr/lib/spark/python/pyspark/context.py", line 270, in signal_handler
    raise KeyboardInterrupt()
KeyboardInterrupt: 


In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Create a RandomForestClassifier model with default parameters
rf = RandomForestClassifier(featuresCol='features', labelCol='is_fraud')

# Train the model using the training set
rf_model = rf.fit(train_set)

# Make predictions on the test set
predictions = rf_model.transform(test_set)

# Evaluate the model using binary classification metrics
binary_evaluator = BinaryClassificationEvaluator(labelCol='is_fraud')
accuracy = binary_evaluator.evaluate(predictions)

# Evaluate the model using F1 score
multi_evaluator = MulticlassClassificationEvaluator(labelCol='is_fraud', metricName='f1')
f1_score = multi_evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1_score:.4f}")


In [None]:
#####################################################################################################################################

In [75]:
# Create a logistic regression model
lr = LogisticRegression(featuresCol='features', labelCol='is_fraud')

# Train the model using the training set
start = time.time()
lr_model = lr.fit(train_set)
end = time.time()
print(f"Time to train logistic regression model: {end - start:.4f} seconds")

# Make predictions on the test set
start = time.time()
predictions = lr_model.transform(test_set)
end = time.time()
print(f"Time to make predictions on test set: {end - start:.4f} seconds")

# Evaluate the model using binary classification metrics
evaluator = BinaryClassificationEvaluator(labelCol='is_fraud')
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")

NameError: name 'train_set' is not defined

In [None]:
from pyspark.ml.classification import NaiveBayes

# Create a Naive Bayes model
nb = NaiveBayes(featuresCol='features', labelCol='is_fraud')

# Train the model using the training set
start = time.time()
nb_model = nb.fit(train_set)
end = time.time()
print(f"Time to train Naive Bayes model: {end - start:.4f} seconds")

# Make predictions on the test set
start = time.time()
predictions = nb_model.transform(test_set)
end = time.time()
print(f"Time to make predictions on test set: {end - start:.4f} seconds")

# Evaluate the model using binary classification metrics
evaluator = BinaryClassificationEvaluator(labelCol='is_fraud')
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")

Time to train Naive Bayes model: 194.1303 seconds
Time to make predictions on test set: 0.0233 seconds
Accuracy: 0.4872


In [102]:
from pyspark.ml.classification import GBTClassifier

# Create a GBTClassifier model
gbt = GBTClassifier(featuresCol='features', labelCol='is_fraud')

# Train the model using the training set
start = time.time()
gbt_model = gbt.fit(train_set)
end = time.time()
print(f"Time to train GBTClassifier model: {end - start:.4f} seconds")

# Make predictions on the test set
start = time.time()
predictions = gbt_model.transform(test_set)
end = time.time()
print(f"Time to make predictions on test set: {end - start:.4f} seconds")

# Evaluate the model using binary classification metrics
evaluator = BinaryClassificationEvaluator(labelCol='is_fraud')
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")


Time to train GBTClassifier model: 294.2747 seconds
Time to make predictions on test set: 0.0312 seconds
Accuracy: 0.6095


In [103]:

from pyspark.ml.classification import RandomForestClassifier
# Split the data into training and test sets
start = time.time()
train_set, test_set = train_data.randomSplit([0.7, 0.3], seed=12345)
end = time.time()
print(f"Time to split data into training and test sets: {end - start:.4f} seconds")

# Create a Random Forest model
rf = RandomForestClassifier(
    featuresCol='features', 
    labelCol='is_fraud',
    numTrees=10
)

# Train the model using the training set
start = time.time()
rf_model = rf.fit(train_set)
end = time.time()
print(f"Time to train Random Forest model: {end - start:.4f} seconds")

# Make predictions on the test set
start = time.time()
predictions = rf_model.transform(test_set)
end = time.time()
print(f"Time to make predictions on test set: {end - start:.4f} seconds")

# Evaluate the model using binary classification metrics
evaluator = BinaryClassificationEvaluator(labelCol='is_fraud')
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")


Time to split data into training and test sets: 0.0111 seconds
Time to train Random Forest model: 209.7570 seconds
Time to make predictions on test set: 0.0383 seconds
Accuracy: 0.5983


In [None]:
from pyspark.ml.classification import DecisionTreeClassifier


# Split the data into training and test sets
start = time.time()
train_set, test_set = train_data.randomSplit([0.7, 0.3], seed=12345)
end = time.time()
print(f"Time to split data into training and test sets: {end - start:.4f} seconds")

# Create a DecisionTreeClassifier model
dt = DecisionTreeClassifier(
    featuresCol='features', 
    labelCol='is_fraud',
    maxDepth=5,
    maxBins=32,
    minInstancesPerNode=1,
    impurity='gini'
)

# Train the model using the training set
start = time.time()
dt_model = dt.fit(train_set)
end = time.time()
print(f"Time to train DecisionTreeClassifier model: {end - start:.4f} seconds")

# Make predictions on the test set
start = time.time()
predictions = dt_model.transform(test_set)
end = time.time()
print(f"Time to make predictions on test set: {end - start:.4f} seconds")

# Evaluate the model using binary classification metrics
evaluator = BinaryClassificationEvaluator(labelCol='is_fraud')
accuracy = evaluator.evaluate(predictions)

print(f"Accuracy: {accuracy:.4f}")

Time to split data into training and test sets: 0.0090 seconds
Time to train DecisionTreeClassifier model: 213.2026 seconds
Time to make predictions on test set: 0.0250 seconds
Accuracy: 0.5289


In [None]:
#####################################################################################################################################################################################################

F1 Score

SyntaxError: trailing comma not allowed without surrounding parentheses (<ipython-input-58-955ca27bacc0>, line 3)