In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_addons as tfa

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Read the two competition CSVs: train.csv first, then test.csv
# (the latter is kept as `sub_set`, the rows to predict for the submission).
train_set, sub_set = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    )
)

In [None]:
# Display the column index of both frames side by side as the cell output.
train_set.columns, sub_set.columns

In [None]:
# Raw tweet text of the training rows and of the submission rows.
X, Xsub = train_set['text'].values, sub_set['text'].values
# Labels come from the `target` column of the training frame.
y = train_set['target'].values
# Stack both text arrays (training rows first) so they can be preprocessed
# in a single pass.
Xall = np.concatenate((X, Xsub))

In [None]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')

# English stopword list with 'not' removed, so 'not' survives the filter below.
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

# Loop invariants, built once. Previously the stopword set was rebuilt for
# every single word and a new stemmer was created for every tweet.
stopword_set = frozenset(all_stopwords)
stemmer = PorterStemmer()
non_letters = re.compile('[^a-zA-Z]')


def clean_tweet(text):
    """Normalise one tweet into a space-joined string of stemmed tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, words found in
    `stopword_set` are dropped, and the remaining words are Porter-stemmed.

    Args:
        text: The raw tweet string.

    Returns:
        The cleaned tweet as a single string (may be empty).
    """
    words = non_letters.sub(' ', text).lower().split()
    return ' '.join(stemmer.stem(word) for word in words if word not in stopword_set)


# One cleaned string per row of Xall, in the same order.
corpus = [clean_tweet(text) for text in Xall]

# Bag-of-words counts limited to 3000 features, as a dense array.
vectorizer = CountVectorizer(max_features = 3000)
textColumn = vectorizer.fit_transform(corpus).toarray()

In [None]:
# The first len(y) rows of the feature matrix are the labelled rows;
# everything after them belongs to the submission set.
n_labelled = y.shape[0]
X, Xsub = textColumn[:n_labelled], textColumn[n_labelled:]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 777)

# Seed TensorFlow's global random generator as well. The split above was
# seeded but the network was not, so results differed on every run.
tf.random.set_seed(777)

# Small feed-forward classifier: 100 ReLU units, dropout, 135 linear units,
# and a single sigmoid output unit.
ann = tf.keras.models.Sequential(layers=[
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(.86),  # dropout rate of 0.86 on the first hidden layer
    tf.keras.layers.Dense(135, activation='linear'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
ann.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
def lr_sch(epoch, lr, warmup_epochs=10, decay=0.1):
    """Learning-rate schedule: hold the rate, then decay it exponentially.

    Args:
        epoch: Epoch index supplied by the scheduler callback.
        lr: Learning rate currently in use.
        warmup_epochs: Number of initial epochs during which `lr` is returned
            unchanged. Defaults to 10, the value that was previously hard-coded.
        decay: Exponent of the per-epoch decay; each later epoch multiplies
            the rate by exp(-decay). Defaults to 0.1, as before.

    Returns:
        `lr` itself while `epoch < warmup_epochs`, otherwise the decayed rate
        as a plain Python float.
    """
    # Local import keeps the function usable on its own; `math.exp` yields a
    # Python float, whereas `tf.math.exp` produced a tf.Tensor, which newer
    # Keras versions reject as a schedule return value.
    import math

    if epoch < warmup_epochs:
        return lr
    return float(lr * math.exp(-decay))
# Stop once val_accuracy has gone 8 epochs without improving and restore the
# best weights seen; lr_sch supplies the learning rate for each epoch.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=8, restore_best_weights=True)
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lr_sch)

# Train for up to 100 epochs in batches of 200, validating on the held-out split.
history = ann.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=200,
    epochs=100,
    callbacks=[early_stop, lr_schedule],
)

In [None]:
# Per-epoch metrics recorded during fit().
loss, acc, vloss, vacc = (
    history.history[key]
    for key in ('loss', 'accuracy', 'val_loss', 'val_accuracy')
)

# 2x2 grid: losses in the left column, accuracies in the right column;
# training curves on the top row, validation curves on the bottom row.
fig, axs = plt.subplots(2, 2, figsize=(20, 20))
panels = (
    (axs[0, 0], loss, 'red', 'loss'),
    (axs[1, 0], vloss, 'orange', 'vloss'),
    (axs[0, 1], acc, 'blue', 'acc'),
    (axs[1, 1], vacc, 'cyan', 'vacc'),
)
for ax, series, colour, name in panels:
    ax.plot(series, color=colour, label=name)
    ax.legend()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Threshold the network's sigmoid outputs at 0.5 to obtain class predictions,
# then report each metric on the held-out split.
y_pred = ann.predict(X_test) >= .5
for label, metric in (("acuracy score:", accuracy_score), ("f1 score:", f1_score)):
    print(label, metric(y_test, y_pred))

In [None]:
from sklearn.svm import SVC, NuSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.model_selection import train_test_split

# Same split parameters as for the neural network, so the scores are
# computed on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 777)

# Candidate classical models, all with default hyper-parameters.
classifiers = [
    ('nusvc', NuSVC()),
    ('svc', SVC()),
    ('logistic', LogisticRegression()),
    ('benoulli', BernoulliNB()),
    ('complement', ComplementNB()),
    ('gaussian', GaussianNB()),
    ('mıltinomial', MultinomialNB()),
]
for name, classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    # Print the score and the matrix separately. Printing them inside one
    # tuple showed the '\n' as a literal escape and the matrix as a nested
    # repr instead of putting it on its own lines.
    print(name, accuracy_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.svm import SVC

# Fit a default SVC on every labelled row (no hold-out this time) and
# predict the submission rows.
model = SVC().fit(X, y)
ypred = model.predict(Xsub)

In [None]:
# Load the submission template from the same absolute input directory that
# train.csv and test.csv are read from. The former '../input/...' relative
# path only resolved when the working directory was a sibling of the input
# directory.
submission = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
# Overwrite the template's second column with the predictions.
# NOTE(review): assumes the template rows are in the same order as test.csv — confirm.
submission.iloc[:, 1] = ypred
submission.to_csv('submission.csv', index=False)