In [None]:
# Data handling, train/validation splitting, and evaluation utilities.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import nltk
# Fetch the NLTK stopword corpus so stopwords.words("english") can be used later.
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# NOTE(review): `ps` is not referenced by any later cell visible in this
# notebook (lemmatization is used instead) — confirm before removing.
ps=PorterStemmer()

In [None]:
# Load the train and test CSV files from the relative input directory.
train_df = pd.read_csv("../input/nlp-getting-started/train.csv")
test_df = pd.read_csv("../input/nlp-getting-started/test.csv")



In [None]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

In [None]:
# Working copy of the training data without the identifier column.
X = train_df.drop(columns=["id"])
X.head()

In [None]:
# value_counts() sorts by descending frequency, so position 0 holds the
# most common value of each column.
keyword_counts = X["keyword"].value_counts()
key = keyword_counts.index[0]
print(key)

location_counts = X["location"].value_counts()
loc = location_counts.index[0]
print(loc)

In [None]:
# Fill missing keyword/location values in the training frame with each
# column's most frequent value.
for column in ("keyword", "location"):
    most_common = train_df[column].value_counts().idxmax()
    train_df[column] = train_df[column].fillna(most_common)
train_df.head()


In [None]:
# Fill missing keyword/location values in the test frame with that frame's
# own most frequent value per column, then re-check for remaining nulls.
for column in ("keyword", "location"):
    most_common = test_df[column].value_counts().idxmax()
    test_df[column] = test_df[column].fillna(most_common)
test_df.isnull().sum()


In [None]:
from textblob import Word

# Features are the columns at positions 1-3; the label is everything from
# position 4 on. .copy() detaches X from train_df so the column assignments
# below do not write through a slice (SettingWithCopyWarning).
X = train_df.iloc[:, 1:4].copy()
y = train_df.iloc[:, 4:]

# Lower-case the text and collapse runs of whitespace.
X["text"] = X["text"].apply(lambda text: " ".join(text.lower().split()))

# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip these two cleaning steps.
# Raw strings avoid the invalid "\w" / "\d" escape sequences.
X["text"] = X["text"].str.replace(r"[^\w\s]", " ", regex=True)  # Clear ".,!#"
X["text"] = X["text"].str.replace(r"\d", " ", regex=True)  # Clear numbers
X["text"] = X["text"].str.replace("https", " ", regex=False)  # Clear literal "https"

# Stopwords: `sw` stays a list for later cells; a set gives O(1) membership tests.
sw = stopwords.words("english")
sw_lookup = set(sw)
X["text"] = X["text"].apply(
    lambda text: " ".join(token for token in text.split() if token not in sw_lookup)
)

# Lemmatization (TextBlob's Word.lemmatize needs the WordNet corpus).
nltk.download("wordnet")
X["text"] = X["text"].apply(
    lambda text: " ".join(Word(token).lemmatize() for token in text.split())
)
X["text"].head()



In [None]:
# For Test Data
# .copy() detaches test_x from test_df so the column assignments below do not
# write through a slice (SettingWithCopyWarning).
test_x = test_df.iloc[:, 1:4].copy()
# NOTE(review): test_y is whatever lies at column position 4 onward of the
# test frame; it is not used by any later cell shown here — confirm it is needed.
test_y = test_df.iloc[:, 4:]

# Apply the steps in the same order as the training cell (lower-case,
# punctuation, digits, "https", stopwords, lemmatize) so both splits are
# tokenised identically before vectorisation.
test_x["text"] = test_x["text"].apply(lambda text: " ".join(text.lower().split()))

# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently skip these two cleaning steps.
test_x["text"] = test_x["text"].str.replace(r"[^\w\s]", " ", regex=True)  # punctuation
test_x["text"] = test_x["text"].str.replace(r"\d", " ", regex=True)  # digits
test_x["text"] = test_x["text"].str.replace("https", " ", regex=False)  # Clear literal "https"

# `sw` is the stopword list built in the training-data cell; a set gives
# O(1) membership tests.
sw_lookup = set(sw)
test_x["text"] = test_x["text"].apply(
    lambda text: " ".join(token for token in text.split() if token not in sw_lookup)
)

# Lemmatization
test_x["text"] = test_x["text"].apply(
    lambda text: " ".join(Word(token).lemmatize() for token in text.split())
)
test_x["text"].head()




In [None]:
# Model input for the training data: keyword, a space, then the cleaned text.
X["sentence"] = X["keyword"] + " " + X["text"]
train_text = X["sentence"].values
print(train_text)



In [None]:
# Model input for the test data, built the same way as for training:
# keyword, a space, then the cleaned text.
test_x["sentence"] = test_x["keyword"] + " " + test_x["text"]
test_text = test_x["sentence"].values
print(test_text)

In [None]:
# Hold out 20% of the labelled data for validation (fixed seed for
# reproducibility). `y` is a one-column DataFrame; flattening it to 1-D here
# keeps every later estimator .fit() from receiving a column vector
# (scikit-learn's DataConversionWarning).
x_train, x_test, y_train, y_test = train_test_split(
    train_text, np.ravel(y), test_size=0.2, random_state=1
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vocabulary is learned from the training split only and then reused
# for the validation split and the competition test set. Terms must appear in
# at least 5 documents and in at most 70% of them; at most 1500 features are kept.
vectorizer = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)
print(x_train)
x_train_tfidf = vectorizer.fit_transform(x_train).toarray()
x_test_tfidf, test_last = (
    vectorizer.transform(documents).toarray() for documents in (x_test, test_text)
)





In [None]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import cross_val_score

# Bernoulli Naive Bayes baseline: fit on the training split, evaluate on the
# held-out validation split.
bn = BernoulliNB(alpha=0.2)
bn.fit(x_train_tfidf, y_train)
y_pred = bn.predict(x_test_tfidf)

cm = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cr = classification_report(y_test, y_pred)
for report in (cm, f1, cr):
    print(report)

# NOTE(review): this cross-validates on the validation split itself (the model
# is re-fitted on 10 folds of x_test_tfidf), not on the training split —
# confirm that is the intended estimate.
accuracy = cross_val_score(bn, x_test_tfidf, y_test, cv=10).mean()
print(accuracy)


In [None]:
# Predict the competition test set with the Naive Bayes model and write the
# submission file. The frame inherits test_df's index from the "id" Series,
# so no explicit index assignment is required.
prediction1 = bn.predict(test_last)
output = pd.DataFrame(
    {"id": test_df["id"], "target": prediction1},
    columns=["id", "target"],
)
output.to_csv("submission.csv", index=False)

# Read the file back as a sanity check of what was written.
a = pd.read_csv("submission.csv")
a

# USING CATBOOST


In [None]:
pip install catboost

In [None]:
from catboost import CatBoostClassifier

# CatBoost with default hyper-parameters, evaluated on the same validation
# split as the Naive Bayes baseline.
cb = CatBoostClassifier()
cb.fit(x_train_tfidf, y_train)
y_pred = cb.predict(x_test_tfidf)

cm = confusion_matrix(y_test, y_pred)
print(cm)
f1 = f1_score(y_test, y_pred)
print(f1)
cr = classification_report(y_test, y_pred)
print(cr)

# Cross-validate the CatBoost estimator itself. The original passed `bn`
# (the Naive Bayes model), so this cell re-reported the baseline's score.
accuracy = cross_val_score(cb, x_test_tfidf, y_test, cv=10).mean()
print(accuracy)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for CatBoost: 3 x 3 x 3 = 27 candidate combinations.
catb_params = {
    "iterations": [200, 500, 750],
    "learning_rate": [0.01, 0.05, 0.1],
    "depth": [3, 5, 8],
}

In [None]:
# Exhaustive 5-fold grid search over catb_params on the training split,
# run serially (n_jobs=1) with per-fit progress logging (verbose=2).
cat_b = CatBoostClassifier()
catb_model = GridSearchCV(
    estimator=cat_b,
    param_grid=catb_params,
    cv=5,
    n_jobs=1,
    verbose=2,
)
catb_model.fit(x_train_tfidf, y_train)


In [None]:
# Hyper-parameter combination with the best cross-validated score in the grid search.
catb_model.best_params_

In [None]:
# CatBoost re-fitted with fixed hyper-parameters.
# NOTE(review): these values are hard-coded, presumably copied from
# catb_model.best_params_ — confirm they still match after re-running the search.
cb = CatBoostClassifier(iterations=750, learning_rate=0.05, depth=5)
cb.fit(x_train_tfidf, y_train)
y_pred = cb.predict(x_test_tfidf)

cm = confusion_matrix(y_test, y_pred)
print(cm)
f1 = f1_score(y_test, y_pred)
print(f1)
cr = classification_report(y_test, y_pred)
print(cr)

# Cross-validate the tuned CatBoost estimator. The original passed `bn`
# (the Naive Bayes model), so the printed score never reflected the tuning.
accuracy = cross_val_score(cb, x_test_tfidf, y_test, cv=10).mean()
print(accuracy)

In [None]:
# Predict the competition test set with the tuned CatBoost model. The original
# called bn.predict here, which rewrote submission.csv with the Naive Bayes
# predictions instead of CatBoost's.
# ravel + astype(int) normalise the prediction array to flat integer labels
# regardless of the shape/dtype the installed CatBoost version returns.
prediction1 = np.ravel(cb.predict(test_last)).astype(int)
output = pd.DataFrame(
    {"id": test_df["id"], "target": prediction1},
    columns=["id", "target"],
)
output.index = test_df.index

output.to_csv("submission.csv", index=False)

# Read the file back as a sanity check of what was written.
a = pd.read_csv("submission.csv")
a