In [None]:
import pandas as pd  # data processing
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split  # data split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from mp_api import MPRester

In [None]:
"""
_______________________________________________________
Problem 2: Credit Card Fraud
_______________________________________________________
"""

# Load the transactions and discard the 'Time' column, which is not used as a feature.
df = pd.read_csv('creditcard.csv')
df = df.drop('Time', axis=1)

# Sanitize up front: turn +/-inf into NaN and drop incomplete rows BEFORE any
# arrays are extracted. (Cleaning `df` after `X`/`y` have been pulled out has no
# effect on what the models are trained on.) reset_index returns a new frame, so
# its result must be assigned for it to do anything.
df = df.replace([np.inf, -np.inf], np.nan).dropna().reset_index(drop=True)

print(df.head())

# Split by label once and reuse the slices for both the counts and the Amount stats.
nonfraud_cases = df[df.Class == 0]
fraud_cases = df[df.Class == 1]

cases = len(df)
nonfraud_count = len(nonfraud_cases)
fraud_count = len(fraud_cases)
# Share of ALL transactions that are fraudulent (denominator is the total number
# of cases, not the number of non-fraud cases).
fraud_percentage = round(fraud_count / cases * 100, 2) if cases else 0.0

print('CASE COUNT')
print('--------------------------------------------')
print('Total number of cases are {}'.format(cases))
print('Number of Non-fraud cases are {}'.format(nonfraud_count))
print('Number of fraud cases are {}'.format(fraud_count))
print('Percentage of fraud cases is {}%'.format(fraud_percentage))
print('--------------------------------------------')

# Transaction-amount summary statistics for each class.
print('CASE AMOUNT STATISTICS')
print('--------------------------------------------')
print('NON-FRAUD CASE AMOUNT STATS')
print(nonfraud_cases.Amount.describe())
print('--------------------------------------------')
print('FRAUD CASE AMOUNT STATS')
print(fraud_cases.Amount.describe())
print('--------------------------------------------')

# Standardize 'Amount' (zero mean, unit variance) so its scale does not dominate
# distance/margin-based models.
# NOTE(review): the scaler is fit on the full data set before the train/test
# split, so test-set statistics leak into training. Consider fitting on the
# training rows only.
sc = StandardScaler()
amount = df['Amount'].values
df['Amount'] = sc.fit_transform(amount.reshape(-1, 1)).ravel()

print(df['Amount'].head(10))

# Features: every column except 'Class'. Target: 'Class'.
X = df.drop('Class', axis=1).values
y = df['Class'].values

# Stratify on the label so the train and test sets keep the same class ratio;
# a fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

print('X_train samples : ')
print(X_train[:1])
print('X_test samples : ')
print(X_test[0:1])
print('y_train samples : ')
print(y_train[0:20])
print('y_test samples : ')
print(y_test[0:20])

# MODELING

# 1. Decision Tree
# random_state pins the tie-breaking between equally good splits so repeated
# runs give the same tree.
tree_model = DecisionTreeClassifier(max_depth=4, criterion='entropy', random_state=0)
tree_model.fit(X_train, y_train)
tree_yhat = tree_model.predict(X_test)

# The remaining models are disabled; uncomment a block together with its
# matching accuracy print below to include it.

# 2. K-Nearest Neighbors

# n = 5

# knn = KNeighborsClassifier(n_neighbors=n)
# knn.fit(X_train, y_train)
# knn_yhat = knn.predict(X_test)

# 3. Logistic Regression

# lr = LogisticRegression()
# lr.fit(X_train, y_train)
# lr_yhat = lr.predict(X_test)

# 4. SVM
# (named svm_model so it does not rebind the imported `svm` module, which the
# spam cell calls as `svm.SVC()`)

# svm_model = SVC()
# svm_model.fit(X_train, y_train)
# svm_yhat = svm_model.predict(X_test)

# 5. Random Forest Tree

# rf = RandomForestClassifier(max_depth=4)
# rf.fit(X_train, y_train)
# rf_yhat = rf.predict(X_test)

# Print accuracy scores
# NOTE(review): if fraud is a small minority (see the percentage printed above),
# plain accuracy can look high even for a model that never predicts fraud;
# consider also reporting precision/recall/F1.

print('ACCURACY SCORE')
print('------------------------------------------------------------------------')
print('Accuracy score of the Decision Tree model is {}'.format(accuracy_score(y_test, tree_yhat)))
print('------------------------------------------------------------------------')
# print('Accuracy score of the KNN model is {}'.format(accuracy_score(y_test, knn_yhat)))
# print('------------------------------------------------------------------------')
# print('Accuracy score of the Logistic Regression model is {}'.format(accuracy_score(y_test, lr_yhat)))
# print('------------------------------------------------------------------------')
# print('Accuracy score of the SVM model is {}'.format(accuracy_score(y_test, svm_yhat)))
# print('------------------------------------------------------------------------')
# print('Accuracy score of the Random Forest Tree model is {}'.format(accuracy_score(y_test, rf_yhat)))
# print('------------------------------------------------------------------------')


In [None]:
"""
_______________________________________________________
Problem 2: Email Spam
_______________________________________________________
"""

# Load the labelled messages: column 'v1' is used as the label, 'v2' as the text.
# NOTE(review): if this file is not UTF-8 encoded (some public spam.csv exports
# are Latin-1), read_csv will raise UnicodeDecodeError and needs
# encoding='latin-1' — confirm against the actual file.
spam = pd.read_csv('spam.csv')

text = spam['v2']
label = spam['v1']

# A fixed random_state makes the split (and therefore the reported accuracy)
# reproducible across runs; stratifying keeps the label ratio equal in both parts.
text_train, text_test, label_train, label_test = train_test_split(
    text, label, test_size=0.2, random_state=0, stratify=label
)

# Bag-of-words: learn the vocabulary on the training messages only and count
# word occurrences per message.
cv = CountVectorizer()
features = cv.fit_transform(text_train)

# Support vector machine classifier. SVC is referenced through its direct
# import so this cell keeps working even if the name `svm` is rebound to a
# model instance elsewhere in the notebook.
model = SVC()
model.fit(features, label_train)

# Reuse the fitted vocabulary for the held-out messages, then report mean accuracy.
features_test = cv.transform(text_test)
print("Accuracy: {}".format(model.score(features_test, label_test)))


In [None]:
"""
_______________________________________________________
Problem 1: Materials Project
_______________________________________________________
"""

