In [None]:
#@title タイタニック
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split # データセット分割用
from sklearn.metrics import accuracy_score
from pylab import rcParams
from sklearn.metrics import confusion_matrix
np.set_printoptions(suppress=True)#指数表記禁止

import IPython
def display(*dfs, head=True):
    """Render each given object through IPython's rich display.

    Args:
        *dfs: Objects to show. When ``head`` is true each one must provide
            a ``head()`` method (e.g. a pandas DataFrame).
        head: If true (the default) only ``obj.head()`` is rendered;
            otherwise the object itself is rendered.
    """
    for frame in dfs:
        if head:
            IPython.display.display(frame.head())
        else:
            IPython.display.display(frame)

# Plot feature importances as a horizontal bar chart.
def plot_feature_importance(df):
    """Draw one horizontal bar per feature using the pyplot state machine.

    Args:
        df: DataFrame with a 'feature' column (tick labels) and an
            'importance' column (bar lengths). Rows are plotted in
            ascending order of importance.
    """
    n_bars = len(df)
    ordered = df.sort_values('importance')                 # ascending sort for plotting
    plt.barh(range(n_bars), ordered['importance'].values, align='center')
    plt.yticks(np.arange(n_bars), ordered['feature'].values)  # label each bar with its feature name
    plt.xlabel('Feature importance')                       # x-axis title
    plt.ylabel('Feature')                                  # y-axis title


# Load the raw competition files.
pretest  = pd.read_csv(filepath_or_buffer="/content/test.csv")
pretrain = pd.read_csv(filepath_or_buffer="/content/train.csv")

# Drop the columns this model does not use.
drop_cols = ['Name','Cabin','Ticket','Fare','PassengerId','Parch']
traindata = pretrain.drop(drop_cols, axis=1)
testdata  = pretest.drop(drop_cols, axis=1)

# Missing ages become 0 and the column is stored as integers.
# fillna() is used instead of np.nan_to_num(..., copy=False): the latter only
# works if NumPy happens to get a writable view of the pandas column, and when
# it silently does nothing the integer cast below fails on NaN.
traindata['Age'] = traindata['Age'].fillna(0).astype('int')
testdata['Age']  = testdata['Age'].fillna(0).astype('int')

# Sex -> integer code. The encoder is fitted on the training data only and
# re-used for the test data so both frames share exactly the same mapping.
SEX_le = LabelEncoder()
traindata['Sex'] = SEX_le.fit_transform(traindata['Sex'])
testdata['Sex']  = SEX_le.transform(testdata['Sex'])

# Embarked -> integer code, same fit-on-train / transform-on-test rule.
# Missing ports are mapped to the explicit category 'NA' before encoding.
# NOTE(review): transform() raises if the test data contains a category
# (including 'NA') that never occurs in the training data.
eb_le = LabelEncoder()
traindata['Embarked'] = eb_le.fit_transform(traindata['Embarked'].fillna('NA'))
testdata['Embarked']  = eb_le.transform(testdata['Embarked'].fillna('NA'))


# Features, target, and a stratified hold-out split.
X = traindata.drop('Survived',axis=1).values # explanatory variables
y = traindata['Survived'].values             # target variable
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.10, random_state=2,stratify=y)
# test_size=0.10 -> 90% of the rows are used for training, 10% are held out.



# Wrap the split arrays as LightGBM datasets; the hold-out set is used for
# validation / early stopping.
putilgb_train = lgb.Dataset(X_train, y_train)
putilgb_eval = lgb.Dataset(X_test, y_test, reference=putilgb_train)


# LightGBM parameters.
# 'Survived' is a 0/1 label, so the binary objective (predictions are
# probabilities) and log-loss metric are used instead of the regression
# objective 'mape' with an 'rmse' metric.
# The number of rounds is passed to lgb.train() only: 'num_iterations' in this
# dict is an alias of num_boost_round and overrode the 1000 given there.
# NOTE(review): learning_rate is very small; confirm it is intentional.
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss'},
        'learning_rate': 0.0001,
        'num_leaves': 70,
}


# Train the model.
# Early stopping and log silencing are supplied as callbacks; the former
# early_stopping_rounds= / verbose_eval= keyword arguments were removed from
# lgb.train() in LightGBM 4.0.
model = lgb.train(params,
                  train_set=putilgb_train,      # training data
                  num_boost_round=50000,        # the value that was effectively used before
                  valid_sets=[putilgb_eval],    # validation data
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=20, verbose=False),
                      lgb.log_evaluation(period=0),
                  ],
                  )





# Score the competition test set and turn the raw scores into 0/1 labels.
pred = model.predict(testdata)

# A passenger is labelled as survived only when the score exceeds 0.80.
pred = np.where(pred > 0.80, 1, 0)

# Pair every PassengerId with its predicted label.
pre_ans = np.stack([pretest["PassengerId"], pred], 1)
print(pre_ans)

# Write the submission file. index=False keeps the DataFrame index out of the
# CSV so the file contains exactly the two columns PassengerId and Survived.
ans = pd.DataFrame(pre_ans, columns=['PassengerId', 'Survived'])
ans.to_csv('/content/ans3.csv', index=False)


# Evaluate on the hold-out split.
# The scores must come from X_test: the predictions made for the competition
# test set have a different number of rows than y_test, so comparing those
# against y_test raises an "inconsistent numbers of samples" error.
survive_pred = model.predict(X_test, num_iteration=model.best_iteration)
print(survive_pred)


# Convert the scores to labels with the same 0.80 cut-off used for the submission.
y_pred_fin = [1 if score > 0.80 else 0 for score in survive_pred]

# Confusion matrix
print(confusion_matrix(y_test, y_pred_fin))

# Accuracy
print(accuracy_score(y_test, y_pred_fin))



# Feature-importance report only; not needed once the feature set is fixed.
## Collect the importances into a DataFrame.
cols = traindata.drop('Survived', axis=1).columns.tolist()

# importance_type='gain': importance measured by the loss reduction of the splits.
f_importance = np.array(model.feature_importance(importance_type='gain'))
f_importance = f_importance / f_importance.sum()  # normalise (comment out if not wanted)
df_importance = (
    pd.DataFrame({'feature': cols, 'importance': f_importance})
    .sort_values('importance', ascending=False)   # descending order
)
display(df_importance)
plot_feature_importance(df_importance)


In [None]:
#@title 国税調査からの年収予測 Part2 アンサンブルver

#import
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss  #対数損失のインポート
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb


# Load the data.
pretrain = pd.read_csv("/content/train.csv") # training data
pretest = pd.read_csv("/content/test.csv")   # data to predict
sample_submit = pd.read_csv("/content/sample_submit.csv", header=None) # sample submission file

# Split into features and target.
train_x = pretrain.drop(['Y'],axis=1)
train_y = pretrain['Y']
test_x  = pretest.copy()

# Label-encode the categorical columns. Each encoder is fitted on the training
# column (missing values mapped to 'NA') and then applied to both frames.
for c in ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']:
  le = LabelEncoder()
  le.fit(train_x[c].fillna('NA'))

  train_x[c] = le.transform(train_x[c].fillna('NA'))
  test_x[c]  = le.transform(test_x[c].fillna('NA'))


# Gradient-boosted trees. The keyword was misspelled as 'n_estimatiors', so the
# requested number of trees was never applied; it is spelled correctly here.
model = XGBClassifier(n_estimators=20,random_state=7)
model.fit(train_x,train_y)

# Probability of the positive class for every test row.
pred_xgb = model.predict_proba(test_x)[:,1]


# -----------------------------------
# Build the features for logistic regression
# -----------------------------------
from sklearn.preprocessing import OneHotEncoder

# Start again from copies of the raw data.
train_x2 = pretrain.drop(['Y'], axis=1)
test_x2 = pretest.copy()

# One-hot encode the categorical columns.
# The encoder is left at its default (sparse) output and densified with
# .toarray() below: the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and later removed, so this form works on old and new versions.
cat_cols = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']
ohe = OneHotEncoder(categories='auto')
ohe.fit(train_x2[cat_cols].fillna('NA'))

# Column names of the dummy variables: "<column>_<category>".
ohe_columns = []
for i, c in enumerate(cat_cols):
    ohe_columns += [f'{c}_{v}' for v in ohe.categories_[i]]

# Apply the encoding. The original row index is carried over so the
# column-wise concat below lines the rows up explicitly.
ohe_train_x2 = pd.DataFrame(ohe.transform(train_x2[cat_cols].fillna('NA')).toarray(),
                            columns=ohe_columns, index=train_x2.index)
ohe_test_x2 = pd.DataFrame(ohe.transform(test_x2[cat_cols].fillna('NA')).toarray(),
                           columns=ohe_columns, index=test_x2.index)

# Remove the columns that have been one-hot encoded.
train_x2 = train_x2.drop(cat_cols, axis=1)
test_x2 = test_x2.drop(cat_cols, axis=1)

# Append the dummy variables.
train_x2 = pd.concat([train_x2, ohe_train_x2], axis=1)
test_x2 = pd.concat([test_x2, ohe_test_x2], axis=1)


# -----------------------------------
# Ensemble
# -----------------------------------

# Logistic regression, trained on the separately prepared train_x2 / test_x2
# feature frames rather than on the frames used for xgboost.
model_lr = LogisticRegression(solver='lbfgs', max_iter=300)
model_lr.fit(train_x2, train_y)
proba_lr = model_lr.predict_proba(test_x2)
pred_lr = proba_lr[:, 1]

# Weighted average of the two models' positive-class probabilities,
# thresholded at 0.5 to obtain the label.
pred = pred_xgb * 0.8 + pred_lr * 0.2
pred_label = np.where(pred > 0.5, 1, 0)

# Write the submission file.
submission = pd.DataFrame({
    'index': test_x['index'],
    'Y': pred_label,
})
submission.to_csv('/content/ans2.csv', index=False)


In [None]:
#@title Player Contact Detection　締め切り2023/03/01

import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split # データセット分割用
from sklearn.metrics import accuracy_score
from pylab import rcParams
from sklearn.metrics import confusion_matrix
np.set_printoptions(suppress=True)#指数表記禁止

import IPython
def display(*dfs, head=True):
    """Render each given object through IPython's rich display.

    Args:
        *dfs: Objects to show. When ``head`` is true each one must provide
            a ``head()`` method (e.g. a pandas DataFrame).
        head: If true (the default) only ``obj.head()`` is rendered;
            otherwise the object itself is rendered.
    """
    for frame in dfs:
        if head:
            IPython.display.display(frame.head())
        else:
            IPython.display.display(frame)

# Plot feature importances as a horizontal bar chart.
def plot_feature_importance(df):
    """Draw one horizontal bar per feature using the pyplot state machine.

    Args:
        df: DataFrame with a 'feature' column (tick labels) and an
            'importance' column (bar lengths). Rows are plotted in
            ascending order of importance.
    """
    n_bars = len(df)
    ordered = df.sort_values('importance')                 # ascending sort for plotting
    plt.barh(range(n_bars), ordered['importance'].values, align='center')
    plt.yticks(np.arange(n_bars), ordered['feature'].values)  # label each bar with its feature name
    plt.xlabel('Feature importance')                       # x-axis title
    plt.ylabel('Feature')                                  # y-axis title


In [None]:
#@title Playground Series - Season 3, Episode 5 アンサンブルver

#import
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss  #対数損失のインポート
from sklearn.model_selection import KFold
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb


# Extra scikit-learn helpers used by this cell.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline

# Load the data.
pretrain = pd.read_csv("/content/train.csv") # training data
pretest = pd.read_csv("/content/test.csv")   # data to predict

# Split into features and target.
# NOTE(review): the 'Id' column is still part of the feature frames; confirm
# that using the row identifier as a feature is intended.
train_x = pretrain.drop(['quality'],axis=1)
train_y = pretrain['quality']
test_x  = pretest.copy()

# 'quality' is a multiclass target. Encode its values as 0..K-1 for training
# (recent xgboost releases reject class labels that do not start at 0) and
# keep the encoder so predictions can be mapped back to the original values.
quality_le = LabelEncoder()
train_y_enc = quality_le.fit_transform(train_y)

# Gradient-boosted trees. 'n_estimators' was misspelled as 'n_estimatiors',
# so the requested number of trees was never applied.
model = XGBClassifier(n_estimators=20,random_state=7)
model.fit(train_x,train_y_enc)

# Full class-probability matrix, one column per encoded class.
pred_xgb = model.predict_proba(test_x)


# -----------------------------------
# Features for logistic regression
# -----------------------------------

# Fresh copies of the raw data.
train_x2 = pretrain.drop(['quality'], axis=1)
test_x2 = pretest.copy()


# -----------------------------------
# Ensemble
# -----------------------------------

# Logistic regression on standardised features. Without scaling, lbfgs stopped
# at max_iter with a ConvergenceWarning that recommends scaling the data.
model_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='lbfgs', max_iter=300),
)
model_lr.fit(train_x2, train_y_enc)
pred_lr = model_lr.predict_proba(test_x2)

# Weighted average of the two class-probability matrices. Both models were
# trained on the same encoded labels, so their columns refer to the same
# classes. The submitted label is the most probable class, mapped back to the
# original quality value. (Previously a single class's probability times 100
# was cast to an integer and submitted as the quality.)
pred = pred_xgb * 0.8 + pred_lr * 0.2
pred_label = quality_le.inverse_transform(pred.argmax(axis=1))

# Write the submission file.
submission = pd.DataFrame({'Id':test_x['Id'], 'quality':pred_label})
submission.to_csv('/content/ans2.csv',index=False)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#@title Playground Series Season 3, Episode 5
import csv
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split # データセット分割用
from sklearn.metrics import accuracy_score
from pylab import rcParams
from sklearn.metrics import confusion_matrix
np.set_printoptions(suppress=True)#指数表記禁止

import IPython
def display(*dfs, head=True):
    """Render each given object through IPython's rich display.

    Args:
        *dfs: Objects to show. When ``head`` is true each one must provide
            a ``head()`` method (e.g. a pandas DataFrame).
        head: If true (the default) only ``obj.head()`` is rendered;
            otherwise the object itself is rendered.
    """
    for frame in dfs:
        if head:
            IPython.display.display(frame.head())
        else:
            IPython.display.display(frame)

# Plot feature importances as a horizontal bar chart.
def plot_feature_importance(df):
    """Draw one horizontal bar per feature using the pyplot state machine.

    Args:
        df: DataFrame with a 'feature' column (tick labels) and an
            'importance' column (bar lengths). Rows are plotted in
            ascending order of importance.
    """
    n_bars = len(df)
    ordered = df.sort_values('importance')                 # ascending sort for plotting
    plt.barh(range(n_bars), ordered['importance'].values, align='center')
    plt.yticks(np.arange(n_bars), ordered['feature'].values)  # label each bar with its feature name
    plt.xlabel('Feature importance')                       # x-axis title
    plt.ylabel('Feature')                                  # y-axis title


# Read the competition files.
pretest  = pd.read_csv(filepath_or_buffer="/content/test.csv")
pretrain = pd.read_csv(filepath_or_buffer="/content/train.csv")

# Remove the columns that are not used as features.
unused_cols = ['Id','pH','chlorides','fixed acidity']
traindata = pretrain.drop(unused_cols, axis=1)
testdata  = pretest.drop(unused_cols, axis=1)


# Feature matrix and target vector as NumPy arrays.
y = traindata['quality'].values             # target variable
X = traindata.drop('quality', axis=1).values # explanatory variables

# Hold out 10% of the rows (test_size=0.10) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.10,
    random_state=2,
)



# Wrap the split arrays as LightGBM datasets; the hold-out set is used for
# validation / early stopping.
putilgb_train = lgb.Dataset(X_train, y_train)
putilgb_eval = lgb.Dataset(X_test, y_test, reference=putilgb_train)


# LightGBM parameters.
# The number of rounds is passed to lgb.train() only: 'num_iterations' in this
# dict is an alias of num_boost_round and overrode the 1000 given there.
params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective':'multiclass',
        'metric': {'multi_logloss'}, # evaluation metric: multiclass log loss
        'num_class': 9,              # the raw 'quality' values are used directly as class ids
        'learning_rate': 0.01,
        'num_leaves':30,
}


# Train the model.
# Early stopping and log silencing are supplied as callbacks; the former
# early_stopping_rounds= / verbose_eval= keyword arguments were removed from
# lgb.train() in LightGBM 4.0.
model = lgb.train(params,
                  train_set=putilgb_train,      # training data
                  num_boost_round=500,          # the value that was effectively used before
                  valid_sets=[putilgb_eval],    # validation data
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=20, verbose=False),
                      lgb.log_evaluation(period=0),
                  ],
                  )



# With the multiclass objective, predict() returns one probability per class
# for every row, i.e. an array of shape (n_rows, num_class).
pred = model.predict(testdata)

# Pick the most probable class per row. The model was trained on the raw
# 'quality' values as class ids, so the column index is the quality itself.
# (Casting the probability matrix to int would truncate every value to 0 and
# leave a 2-D array that cannot be stored in a single DataFrame column.)
pred   = pred.argmax(axis=1).astype('int')


# Write the submission file.
submission = pd.DataFrame({'Id':pretest['Id'], 'quality':pred})
submission.to_csv('/content/ans3.csv',index=False)



In [None]:
#@title Playground Series - Season 3, Episode 5 Part3

import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
import xgboost as xgb 
import lightgbm as lgbm
import optuna
from optuna.samplers import TPESampler

# Locations and column roles.
path = '/content/'
target = 'quality'

# Competition training data without the identifier column.
train = pd.read_csv(path +'train.csv').drop('Id',axis=1)
excluded = ('id', 'Time', target)
features = [col for col in train.columns if col not in excluded]

# Append the de-duplicated original dataset to the competition training set.
orig_train = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
orig_train = orig_train.drop_duplicates()
train = pd.concat([train, orig_train], ignore_index=True)
train['split'] = 'train'

# Test data, tagged so it can be told apart after stacking.
test = pd.read_csv(path +'test.csv').drop('Id',axis=1)
test['split'] = 'test'

# One combined frame holding both splits, plus the sample submission.
data = pd.concat([train, test], ignore_index=True)
sub = pd.read_csv(path +'sample_submission.csv')