In [24]:
import pandas as pd
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

In [25]:
def read_file(train_path="data_set/train.csv", test_path="data_set/test1.csv"):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    
    df1 = train.drop(["Unnamed: 0"], axis=1)
    df2 = test.drop(["Unnamed: 0"], axis=1)
    df2["label"] = -1
    
    for col in ["android_id", "apptype", "carrier", "ntt", "media_id", "cus_type", "package", "location"]:
        df1[col] = df1[col].astype("category")
        df2[col] = df2[col].astype("category")
        
    for col in ["fea_hash"]:
        df1[col] = df1[col].map(lambda x: 0 if len(str(x)) > 16 else int(x))
        df2[col] = df2[col].map(lambda x: 0 if len(str(x)) > 16 else int(x))
        
    for col in ["dev_height", "dev_ppi", "dev_width", "fea_hash", "fea1_hash","label"]:
        df1[col] = df1[col].astype("int64")
        df2[col] = df2[col].astype("int64")
        
    df1["time"] = pd.to_datetime(df1["timestamp"], unit="ms", origin=pd.Timestamp("1970-01-01"))
    df2["time"] = pd.to_datetime(df2["timestamp"], unit="ms", origin=pd.Timestamp("1970-01-01"))
    
    df1["day"] = df1.time.dt.day
    df2["day"] = df2.time.dt.day

    df1["hour"] = df1.time.dt.hour
    df2["hour"] = df2.time.dt.hour

    df1["minute"] = df1.time.dt.minute
    df2["minute"] = df2.time.dt.minute

    df1.set_index("sid", drop=True, inplace=True)
    df2.set_index("sid", drop=True, inplace=True)
    
    df1.dev_height[df1.dev_height == 0] = None
    df1.dev_width[df1.dev_width == 0] = None
    df1.dev_ppi[df1.dev_ppi == 0] = None
    df2.dev_height[df2.dev_height == 0] = None
    df2.dev_width[df2.dev_width == 0] = None
    df2.dev_ppi[df2.dev_ppi == 0] = None
    return df1, df2

def process_cate(df1,df2,col):
    le = preprocessing.LabelEncoder()
    
    df1[col] = le.fit_transform(df1[col])
    df1[col] = df1[col].astype("category")
    df2[col] = le.transform(df2[col])
    df2[col] = df2[col].astype("category")
    return df1, df2

def dict_cate(df1,df2,col,dic):
    df1[col] = df1[col].map(dic)
    df1[col] = df1[col].astype("category")
    df2[col] = df2[col].map(dic)
    df2[col] = df2[col].astype("category")
    return df1,df2

def remove_special(df1,df2,col,tops,inplace = -1):
    idx_sets = set(df1[col].value_counts().head(tops).index)

    def helper(x):
        if x in idx_sets:
            return x
        else:
            return inplace

    df1[col] = df1[col].apply(helper)
    df2[col] = df2[col].apply(helper)
    return df1, df2

def process_sp_cate(df1,df2,col): 
    if col == "apptype":
        df1, df2 = remove_special(df1, df2, col, 75, -1)
    if col == "media_id":
        df1, df2 = remove_special(df1, df2, col, 200, -1)
    if col == "version":
        df2[col] = df2[col].replace("20", "0").replace("21", "0")
    if col == "lan":
        lan_set = set(df1[col].value_counts().head(12).index)
        def foreign_lan(x):
            native_lan = {'zh-CN', 'zh', 'cn', 'zh_CN', 'Zh-CN', 'zh-cn', 'ZH', 'CN', 'zh_CN_#Hans'}
            if x in native_lan:
                return 0
            elif x == "unk":
                return 2
            else:
                return 1
        df1["f_lan"] = df1["lan"].apply(foreign_lan)
        df2["f_lan"] = df2["lan"].apply(foreign_lan)
        def helper_lan(x):
            if x in lan_set:
                return x
            else:
                return "unk"
        df1[col] = df1[col].apply(helper_lan)
        df2[col] = df2[col].apply(helper_lan)

    if col == "package":
        df1, df2 = remove_special(df1, df2, col, 800, -1)

    if col == "fea1_hash":
        df1, df2 = remove_special(df1, df2, col, 850, -1)

    if col == "fea_hash":
        df1, df2 = remove_special(df1, df2, col, 850, -1)
    df1, df2 = process_cate(df1, df2, col)
    return df1,df2

def process_osv(df1, df2):
    def helper(x):
        x = str(x)
        if not x:
            return -1
        elif x.startswith("Android"):
            x = str(re.findall("\d{1}\.*\d*\.*\d*",x)[0])
            return x
        elif x.isdigit():
            return x
        else:
            try:
                x = str(re.findall("\d{1}\.\d\.*\d*", x)[0])
                return x
            except:
                return 0
    df1.osv = df1.osv.apply(helper)
    df2.osv = df2.osv.apply(helper)

    osv_set = set(df1["osv"].value_counts().head(70).index)
    def helper2(x):
        if x in osv_set:
            return x
        else:
            return 0
    df1["osv"] = df1["osv"].apply(helper2)
    df2["osv"] = df2["osv"].apply(helper2)
    le = preprocessing.LabelEncoder()
    df1.osv = le.fit_transform(df1.osv.astype("str"))
    df1["osv"] = df1["osv"].astype("category")
    df2.osv = le.transform(df2.osv.astype("str"))
    df2["osv"] = df2["osv"].astype("category")
    return df1,df2

In [26]:
def rf(df1,df2):
    #缺失值填补
    c1 = df1.dev_width.notnull()
    c2 = df1.dev_height.notnull()
    c3 = df1.dev_ppi.isna()
    c4 = df1.dev_ppi.notnull()
    df1["noppi"] = c1 & c2 & c3
    df1["notnull"] = c1 & c2 & c4
    
    predict = df1[["apptype", "carrier", "dev_height", "dev_ppi", "dev_width", "media_id", "ntt", "noppi", "notnull"]]
    
    df_notnans = predict[predict["notnull"] == True]
    
    X_train, X_test, y_train, y_test = train_test_split(
        df_notnans[["apptype", "carrier", "dev_height", "dev_width", "media_id", "ntt"]], df_notnans["dev_ppi"],
        train_size=0.75, random_state=6)
    rf = RandomForestClassifier(n_estimators=100, max_depth=40, random_state=0, n_jobs=-1)
    rf.fit(X_train, y_train)
    score = rf.score(X_test, y_test)
    
    df_nans = predict[predict["noppi"] == True].copy()
    df_nans["dev_ppi_pred"] = rf.predict(df_nans[["apptype", "carrier", "dev_height", "dev_width", "media_id", "ntt"]])
    df1 = pd.merge(df1,df_nans[["dev_ppi_pred"]],on = "sid",how = "left")
    
    c1 = df2.dev_width.notnull()
    c2 = df2.dev_height.notnull()
    c3 = df2.dev_ppi.isna()
    c4 = df2.dev_ppi.notnull()
    df2["noppi"] = c1 & c2 & c3
    df2["notnull"] = c1 & c2 & c4
    
    predict_test = df2[
        ["apptype", "carrier", "dev_height", "dev_ppi", "dev_width", "media_id", "ntt", "noppi", "notnull"]]
    df_nans = predict_test[predict_test["noppi"] == True].copy()
    df_nans["dev_ppi_pred"] = rf.predict(
        df_nans[["apptype", "carrier", "dev_height", "dev_width", "media_id", "ntt"]])
    df2 = pd.merge(df2, df_nans[["dev_ppi_pred"]], on="sid", how="left")
    
    def fill_pii(df):
        a = df.dev_ppi.fillna(0).values
        b = df.dev_ppi_pred.fillna(0).values
        c = []
        # print(a,b)
        for i in range(len(a)):
            c.append(max(a[i], b[i]))
        c = np.array(c)
        df["final_ppi"] = c
        df["final_ppi"][df["final_ppi"] == 0] = None
        return df
    df1 = fill_pii(df1)
    df2 = fill_pii(df2)
    return df1,df2

In [27]:
def feature(df1, df2):
    def divided(x):
        if x % 40 == 0:
            return 2
        elif not x:
            return 1
        else:
            return 0

    df1["160_height"] = df1.dev_height.apply(divided)
    df2["160_height"] = df2.dev_height.apply(divided)
    df1["160_width"] = df1.dev_width.apply(divided)
    df2["160_width"] = df2.dev_width.apply(divided)
    df1["160_ppi"] = df1.final_ppi.apply(divided)
    df2["160_ppi"] = df2.final_ppi.apply(divided)
    df1["hw_ratio"] = df1.dev_height / df1.dev_width
    df2["hw_ratio"] = df2.dev_height / df2.dev_width
    df1["hw_matrix"] = df1.dev_height * df1.dev_width
    df2["hw_matrix"] = df2.dev_height * df2.dev_width
    df1["inch"] = (df1.dev_height ** 2 + df1.dev_width ** 2) ** 0.5 / df1.final_ppi
    df2["inch"] = (df2.dev_height ** 2 + df2.dev_width ** 2) ** 0.5 / df2.final_ppi
    return df1, df2

In [28]:
train, test = read_file()
for col in ["location", "os", "ntt", "cus_type"]:
    train, test = process_cate(train,test,col)
train, test = dict_cate(train,test,"carrier",{-1.0:-1, 0.0:0, 46000.0:1, 46001.0:2, 46003.0:3})
for col in ["apptype", "media_id", "version", "lan", "package", "fea1_hash", "fea_hash"]:
    train, test = process_sp_cate(train,test,col)
train, test = process_osv(train,test)

In [29]:
train, test = rf(train, test)

In [30]:
train, test = feature(train, test)

In [31]:
features = ['android_id', 'apptype', 'carrier', 'dev_height', 'dev_ppi',
       'dev_width','lan', 'media_id', 'ntt', 'os', 'osv', 'package',
       'timestamp', 'version', 'fea_hash', 'location', 'fea1_hash', 'cus_type',
       'day', 'hour', 'minute', 'f_lan', 'noppi', 'notnull',
       'dev_ppi_pred', 'final_ppi', '160_height', '160_width', '160_ppi',
       'hw_ratio', 'hw_matrix', 'inch']

cate_feat = ['apptype', 'carrier', 'lan', 'media_id', 'ntt', 'os', 
       'osv', 'package','timestamp', 'version', 'fea_hash', 'location', 'fea1_hash', 'cus_type',
       'day', 'hour', 'minute', 'f_lan', 'noppi', 'notnull']

In [32]:
def get_model_type():
    model = lgb.LGBMRegressor(
        num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, 
        max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2021,
        n_estimators=600, subsample=0.9, colsample_bytree=0.7,)
    return model

def predict_model(train,test, features, cate_feat):
    df1 = train.copy()
    df2 = test.copy()
    model = get_model_type()
    model.fit(df1[features], df1["label"], categorical_feature = cate_feat, verbose = 100)
    df2["label"] = model.predict(df2[features])
    predict = df2[["label"]]
    return predict

In [33]:
train["android_id"]= train["android_id"].astype("int")
test["android_id"]= test["android_id"].astype("int")

train = train.drop(["time"],axis=1)
test = test.drop(["time"],axis=1)

In [34]:
predict = predict_model(train,test,features,cate_feat)
predict

Unnamed: 0_level_0,label
sid,Unnamed: 1_level_1
1440682,0.072827
1606824,0.732386
1774642,0.021766
1742535,0.038959
1689686,0.940323
...,...
1165373,0.943027
1444115,0.956365
1134378,0.941794
1700238,0.983145


In [35]:
def helper(x):
        if x > 0.5:
            return 1
        else: 
            return 0
predict["label"] = predict["label"].apply(helper)
predict["label"].value_counts()

0    79648
1    70352
Name: label, dtype: int64

In [36]:
train.label.value_counts()

0    257760
1    242240
Name: label, dtype: int64

In [37]:
predict = predict.reset_index(drop = False)
predict

Unnamed: 0,sid,label
0,1440682,0
1,1606824,1
2,1774642,0
3,1742535,0
4,1689686,1
...,...,...
149995,1165373,1
149996,1444115,1
149997,1134378,1
149998,1700238,1


In [38]:
predict.to_csv("predicts.csv", index=False)