In [1]:
from sklearn import linear_model
import pandas as pd
# Toy dataset for the regression example: four (x, y) observations,
# declared column by column.
df = pd.DataFrame({'x': [2, 3, 4, 6], 'y': [2, 4, 6, 7]})

In [5]:
# Fit a linear regression of y on the single feature column x.
# The fit call stays as the last expression so the notebook echoes the
# fitted estimator as the cell's output.
model = linear_model.LinearRegression()
model.fit(X=df[['x']], y=df['y'])

LinearRegression()

In [8]:
# Predict y for x = 8.
# The model was fitted on a DataFrame whose feature column is named 'x', so
# the query is passed as a one-row DataFrame with that same column rather
# than a bare nested list; scikit-learn >= 1.0 emits an "X does not have
# valid feature names" warning when the two do not match.
model.predict(pd.DataFrame({'x': [8]}))

array([9.97142857])

In [9]:
# Score the fitted model against the same x / y data it was trained on
# (there is no held-out set in this toy example).
model.score(X=df[['x']], y=df['y'])

0.8953995157384989

In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

# Load the Titanic passenger data and derive a boolean 'male' feature from
# the 'Sex' column.
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'] == 'male'
X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

# Fixed seed so the train/test split -- and therefore every number printed
# below -- is identical on each Restart & Run All. Without it the split is
# re-drawn at random on every execution.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)

model = LogisticRegression()
model.fit(X_train, y_train)

# Compute the class probabilities once and reuse them for both the printout
# and the thresholded prediction (the original called predict_proba twice).
y_proba = model.predict_proba(X_test)
print(y_proba)

# Flag a passenger as positive only when column 1 of predict_proba exceeds
# this threshold; raise or lower it to trade precision against recall.
THRESHOLD = 0.75
y_pred = y_proba[:, 1] > THRESHOLD

print("precision:", precision_score(y_test, y_pred))
print("recall:", recall_score(y_test, y_pred))

[[0.24257122 0.75742878]
 [0.10289769 0.89710231]
 [0.36687895 0.63312105]
 [0.55655216 0.44344784]
 [0.8717151  0.1282849 ]
 [0.95526979 0.04473021]
 [0.82660759 0.17339241]
 [0.97125834 0.02874166]
 [0.88730331 0.11269669]
 [0.74729389 0.25270611]
 [0.85781345 0.14218655]
 [0.7882165  0.2117835 ]
 [0.16350141 0.83649859]
 [0.43593619 0.56406381]
 [0.22869838 0.77130162]
 [0.88344786 0.11655214]
 [0.67683532 0.32316468]
 [0.65549418 0.34450582]
 [0.87350042 0.12649958]
 [0.86708527 0.13291473]
 [0.11111951 0.88888049]
 [0.74962743 0.25037257]
 [0.67011584 0.32988416]
 [0.16069258 0.83930742]
 [0.85740027 0.14259973]
 [0.92253767 0.07746233]
 [0.9113698  0.0886302 ]
 [0.39235058 0.60764942]
 [0.84016863 0.15983137]
 [0.93109134 0.06890866]
 [0.85264941 0.14735059]
 [0.86707211 0.13292789]
 [0.31004867 0.68995133]
 [0.53577338 0.46422662]
 [0.25133091 0.74866909]
 [0.86065702 0.13934298]
 [0.95182917 0.04817083]
 [0.88725149 0.11274851]
 [0.05206244 0.94793756]
 [0.40335825 0.59664175]


In [4]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np

# Load the Titanic passenger data and derive a boolean 'male' feature from
# the 'Sex' column.
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'] == 'male'

# Seed the shuffle. With shuffle=True and no random_state, every call to
# kf.split() draws a fresh permutation, so the feature sets compared below
# would each be scored on different folds and the printed metrics would
# change on every run. An integer seed makes each split() call yield the
# same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Three candidate feature matrices sharing the same target vector.
X1 = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
X2 = df[['Pclass', 'male', 'Age']].values
X3 = df[['Fare', 'Age']].values
y = df['Survived'].values

def score_model(X, y, kf):
    """Cross-validate a LogisticRegression and print the mean of each metric.

    For every (train, test) index pair produced by ``kf.split(X)`` a fresh
    ``LogisticRegression`` is fitted on the training rows and evaluated on
    the test rows. The per-fold accuracy, precision, recall and F1 values
    are collected and their means are printed, one metric per line.

    Parameters
    ----------
    X : array supporting positional indexing with an index array
        Feature matrix; rows are selected with ``X[indices]``.
    y : array supporting positional indexing with an index array
        Target values aligned with the rows of ``X``.
    kf : splitter exposing ``split(X)``
        Yields ``(train_index, test_index)`` pairs.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # (printed label, metric function) in the order they are reported.
    metrics = (
        ("accuracy:", accuracy_score),
        ("precision:", precision_score),
        ("recall:", recall_score),
        ("f1 score:", f1_score),
    )
    per_fold = {label: [] for label, _ in metrics}

    for train_idx, test_idx in kf.split(X):
        # A new, unfitted estimator per fold so nothing carries over.
        clf = LogisticRegression()
        clf.fit(X[train_idx], y[train_idx])
        predicted = clf.predict(X[test_idx])
        actual = y[test_idx]
        for label, metric in metrics:
            per_fold[label].append(metric(actual, predicted))

    for label, _ in metrics:
        print(label, np.mean(per_fold[label]))

# Score each candidate feature set with the same target and splitter,
# printing a heading before each report and a blank line between reports.
feature_sets = [
    ("Logistic Regression with all features", X1),
    ("Logistic Regression with Pclass, Sex & Age features", X2),
    ("Logistic Regression with Fare & Age features", X3),
]
for position, (title, features) in enumerate(feature_sets):
    if position:
        print()
    print(title)
    score_model(features, y, kf)

Logistic Regression with all features
accuracy: 0.8004380118072747
precision: 0.7645781565314298
recall: 0.6990831518838543
f1 score: 0.7300619244353259

Logistic Regression with Pclass, Sex & Age features
accuracy: 0.7925918872595696
precision: 0.7492691622103387
recall: 0.7016074743158076
f1 score: 0.7222110650876433

Logistic Regression with Fare & Age features
accuracy: 0.6572779787976893
precision: 0.6493856967620085
recall: 0.2378272070128009
f1 score: 0.34730861005370806


In [4]:
import numpy as np
from sklearn.metrics import adjusted_rand_score
# Reference labeling: two groups of three points each.
true_labels = np.array([0, 0, 0, 1, 1, 1])

# Candidate labelings to compare against true_labels.
pred_labels = np.array([0, 0, 1, 1, 2, 2])
perf_labels = np.array([0, 0, 0, 1, 1, 1])      # identical to the reference
permuted_labels = np.array([1, 1, 1, 0, 0, 0])  # perfect labeling, permuted
renamed_labels = np.array([1, 1, 1, 3, 3, 3])   # labels renamed to 1, 3

# Second example: a bad labeling of a different reference.
true_labels2 = np.array([0, 1, 2, 0, 3, 4, 5, 1])
pred_labels2 = np.array([1, 1, 0, 0, 2, 2, 2, 2])

# (reference, candidate) pairs, scored and printed in this order.
comparisons = [
    (true_labels, pred_labels),
    (true_labels, perf_labels),
    (true_labels, permuted_labels),
    (true_labels, renamed_labels),
    (true_labels2, pred_labels2),
]
for reference, candidate in comparisons:
    ari = adjusted_rand_score(reference, candidate)
    print('{}\n'.format(ari))

0.24242424242424246

1.0

1.0

1.0

-0.12903225806451613

