In [12]:
# import library
from pathlib import Path
import pandas as pd

pd.set_option("display.max_columns", None)
import numpy as np
from tqdm import tqdm
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [13]:
with open("header_names.txt", "r") as file:
    header_names = file.read().splitlines()
header_names


['Hvid',
 'cmnid',
 'AsliCmnId',
 'birthyear',
 'birthmonth',
 'birthday',
 'OstanCmn',
 'CityCmn',
 'sodurplace',
 'CmnCityName',
 'jens',
 'OstanNameService',
 'nesbattext',
 'bimari',
 'bimaritext',
 'bimaridetail',
 'bimaridetailtext',
 'Pezeshk',
 'ToothNo',
 'ToothText',
 'darkhastisumamount',
 'franshiz',
 'DmgAmount',
 'SumAmount',
 'KosooratSharh',
 'HadeseDate',
 'BeginDate',
 'EndDat']

In [14]:
dataset_dir = Path("/home/sadegh/datasets")
chunk = pd.read_csv(
    dataset_dir / "hashtak.csv",
    chunksize=1000000,
    delimiter="\t",
    names=header_names,
    low_memory=False,
)


In [15]:
df = chunk.get_chunk()


In [22]:
df.replace(0, np.nan, inplace=True)
df.replace("0", np.nan, inplace=True)
df.replace("\\N", np.nan, inplace=True)
df.replace(r"\\", np.nan, inplace=True)
df


Unnamed: 0,Hvid,cmnid,AsliCmnId,birthyear,birthmonth,birthday,OstanCmn,CityCmn,sodurplace,CmnCityName,jens,OstanNameService,nesbattext,bimari,bimaritext,bimaridetail,bimaridetailtext,Pezeshk,ToothNo,ToothText,darkhastisumamount,franshiz,DmgAmount,SumAmount,KosooratSharh,HadeseDate,BeginDate,EndDat
0,889842,1378346,1378346,1363,9,11,,,محلات,,mard,,سرپرست,2306.0,جراحيهاي مجاز سرپايي,1304,جراحي هاي مجاز سرپايي,,,,3000000.0000,,3000000.0,3000000.0,,1396/08/20,1396/01/01,1397/01/01
1,891188,1378346,1378346,1363,9,11,,,محلات,,mard,,سرپرست,2533.0,دندانپزشکي(پايه),2393,درمان ريشه مجدد چهار کانال دندان هشت - متخصص,,46,بالا چپ 64D,6300000.0000,,6300000.0,6300000.0,,1396/09/22,1396/01/01,1397/01/01
2,900362,1378346,1378346,1363,9,11,,,محلات,,mard,,سرپرست,2676.0,آزمايش,2147,آزمايشگاه - متفرقه,نظام پزشکي نا مشخص,,,1513000.0000,,1513000.0,1513000.0,,1396/09/03,1396/01/01,1397/01/01
3,1065367,1378346,1378346,1363,9,11,,,محلات,,mard,,سرپرست,2893.0,دندانپزشکي(ايمپلنت),2924,اوردنچر روي چهار واحد ايمپلنت هر فک بابار اتچم...,,50,بالا چپ 65E,6000000.0000,,6000000.0,6000000.0,,1398/03/25,1398/01/01,1399/01/01
4,953346,1378346,1378346,1363,9,11,,,محلات,,mard,,سرپرست,2533.0,دندانپزشکي(پايه),2393,درمان ريشه مجدد چهار کانال دندان هشت - متخصص,نظام پزشکي نا مشخص,50,بالا چپ 65E,1300000.0000,,1300000.0,1300000.0,,1397/05/08,1397/01/01,1398/01/01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,243767,1422257,1422256,1353,,,,,97230,,zan,,همسر,2308.0,ويزيت تخصصي,1345,ويزيت تخصصي,,,,150000.0000,,150000.0,150000.0,,1391/09/06,1391/02/01,1392/02/01
999996,243767,1422257,1422256,1353,,,,,97230,,zan,,همسر,2309.0,ويزيت فوق تخصصي,1344,ويزيت فوق تخصصي,,,,180000.0000,,180000.0,180000.0,,1391/08/28,1391/02/01,1392/02/01
999997,243767,1422257,1422256,1353,,,,,97230,,zan,,همسر,2124.0,داروي مصرفي,1348,داروي مصرفي,,,,3330.0000,,3330.0,3330.0,,1391/09/09,1391/02/01,1392/02/01
999998,243767,1422257,1422256,1353,,,,,97230,,zan,,همسر,2309.0,ويزيت فوق تخصصي,1344,ويزيت فوق تخصصي,,,,180000.0000,,180000.0,180000.0,,1391/09/09,1391/02/01,1392/02/01


In [25]:
df['birthyear'].astype(np.float64)

ValueError: could not convert string to float: '1394/01/01'

In [18]:
df.dtypes

Hvid                   object
cmnid                  object
AsliCmnId              object
birthyear              object
birthmonth             object
birthday               object
OstanCmn               object
CityCmn                object
sodurplace             object
CmnCityName            object
jens                   object
OstanNameService       object
nesbattext             object
bimari                float64
bimaritext             object
bimaridetail           object
bimaridetailtext       object
Pezeshk                object
ToothNo                object
ToothText              object
darkhastisumamount     object
franshiz              float64
DmgAmount             float64
SumAmount             float64
KosooratSharh          object
HadeseDate             object
BeginDate              object
EndDat                 object
dtype: object

In [20]:
df = chunk.get_chunk()

to_drop_columns = {
    "index": [
        "Hvid",
        "cmnid",
        "AsliCmnId",
        "bimari",
        "bimaridetail",
    ],
    "nan": [
        "birthmonth",
        "birthday",
        "OstanCmn",
        "CityCmn",
        "sodurplace",
        "CmnCityName",
        "OstanNameService",
        "Pezeshk",
        "ToothNo",
        "ToothText",
        "KosooratSharh",
    ],
    "financial": [
        "darkhastisumamount",
        "franshiz",
        "SumAmount",
        "SumAmount",
    ],
    "date": [
        "BeginDate",
        "EndDat",
    ],
}

df.drop(
    [
        *to_drop_columns["index"],
        *to_drop_columns["nan"],
        *to_drop_columns["financial"],
        *to_drop_columns["date"],
    ],
    axis=1,
    inplace=True,
)
df.replace(0, np.nan, inplace=True)
df.replace("\\N", np.nan, inplace=True)
df.replace("0", np.nan, inplace=True)
df.dropna(axis=0, inplace=True)
df.drop_duplicates(ignore_index=True, inplace=True)
df

X = df.drop(["bimaritext", "bimaridetailtext", "DmgAmount", "HadeseDate"], axis=1)
y = df.bimaritext


In [21]:
ros = RandomOverSampler(random_state=42)

# fit predictor and target variable
X_ros, y_ros = ros.fit_resample(X, y)

print("Original dataset shape", y.shape)
print("Resample dataset shape", y_ros.shape)


Original dataset shape (884887,)
Resample dataset shape (29453040,)


In [22]:
y_ros.value_counts()


فيزيوتراپي                     193770
کپسول اکسيژن                   193770
راديوگرافي باريم انما          193770
آنژيوگرافي                     193770
ويزيت فوق تخصص روانپزشکي       193770
                                ...  
ويزيت دندانپزشک                193770
آزمايشگاه - بررسي کروموزومي    193770
دندانپزشکي(ايمپلنت)            193770
کمربندطبي                      193770
داروهاي غير بستري ام اس        193770
Name: bimaritext, Length: 152, dtype: int64

In [24]:
categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oh-encode", OneHotEncoder(handle_unknown="ignore", sparse=False)),
    ]
)
numeric_pipeline = Pipeline(
    steps=[("impute", SimpleImputer(strategy="mean")), ("scale", StandardScaler())]
)
cat_cols = X.select_dtypes(exclude="number").columns
num_cols = X.select_dtypes(include="number").columns

full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols),
    ]
)

xgb_cl = xgb.XGBClassifier()
X_processed = full_processor.fit_transform(X)
y_processed = SimpleImputer(strategy="most_frequent").fit_transform(
    y.values.reshape(-1, 1)
)


In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed, test_size=1 / 3, random_state=42
)


In [26]:
# Init classifier
xgb_cl = xgb.XGBClassifier()

# Fit
xgb_cl.fit(X_train, y_train)

# Predict
preds = xgb_cl.predict(X_test)

# Score
accuracy_score(y_test, preds)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)




0.21933937476903884

In [17]:
import pandas as pd
import numpy as np


corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,bimari,franshiz,DmgAmount,SumAmount
bimari,1.0,-0.050956,0.094871,0.004446
franshiz,-0.050956,1.0,0.873211,0.424929
DmgAmount,0.094871,0.873211,1.0,0.043846
SumAmount,0.004446,0.424929,0.043846,1.0


In [32]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    classification_report,
    confusion_matrix,
)

f1_score(y_test, preds, average="micro"), accuracy_score(
    y_test, preds
), precision_score(y_test, preds, average="micro"), recall_score(
    y_test, preds, average="micro"
),


(0.21933937476903886,
 0.21933937476903884,
 0.21933937476903884,
 0.21933937476903884)

In [29]:
sklearn.metrics.accuracy_score(y_test, preds)

0.21933937476903884

In [30]:
sklearn.metrics.precision_recall_fscore_support(y_test, preds)

  _warn_prf(average, modifier, msg_start, len(result))


(array([0.        , 0.        , 0.        , 0.14150943, 0.17088608,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.16666667, 0.27586207, 0.08      , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.15384615, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.2198675 ,
        0.        , 0.        , 0.        , 0.  

In [None]:
# steps = int(18e6) // 100000
# nans = 0
# for i in tqdm(range(steps)):
#     df = chunk.get_chunk()
#     df.replace('\\N', np.nan, inplace=True)
#     nans += df.isna().mean() * 100

# nans = nans / steps
# nans


In [None]:
# nans/170


Hvid                   0.000000
cmnid                  0.000135
AsliCmnId              0.000135
birthyear              0.000135
birthmonth            36.237682
birthday              36.330394
OstanCmn              61.101929
CityCmn               61.101929
sodurplace            36.461335
CmnCityName           61.101929
jens                   0.000176
OstanNameService      80.107459
nesbattext             0.000176
bimari                 0.000176
bimaritext             0.000176
bimaridetail           0.283912
bimaridetailtext       1.783988
Pezeshk               87.693841
ToothNo               88.028471
ToothText             92.724188
darkhastisumamount     0.006771
franshiz               0.000176
DmgAmount              0.000176
SumAmount              0.000176
KosooratSharh         99.972553
HadeseDate             0.000218
BeginDate              0.000218
EndDat                 0.000218
dtype: float64