In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 


In [None]:
# Load the competition files: labelled training data, unlabelled test data,
# and the sample submission file.
train, test, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Report (rows, columns) for the training and test frames, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# List the distinct values present in the label column.
train.label.unique()

In [None]:
# Data preprocessing
# Mapping dictionary that encodes the class names as the integers 0 to 3.
label_mapping = {
    "Depression": 0,
    "Drugs": 1,
    "Suicide": 2,
    "Alcohol": 3,
}

# Keep this cell safe to re-run: pushing already-encoded integers through the
# dictionary would turn every label into NaN, so only encode when the column
# has not been encoded yet.
if not train["label"].isin(list(label_mapping.values())).all():
    encoded_labels = train["label"].map(label_mapping)
    # A value missing from the dictionary maps to NaN; fail loudly here
    # instead of letting NaN targets reach model fitting.
    unknown_labels = train.loc[encoded_labels.isna(), "label"].unique()
    if len(unknown_labels) > 0:
        raise ValueError(f"Unmapped label values: {list(unknown_labels)}")
    train["label"] = encoded_labels.astype("int64")

train.head()

In [None]:

# Horizontal bar chart of the number of training rows per label value.
train.label.value_counts().plot.barh()

In [None]:
import nltk
from nltk.tokenize import word_tokenize

In [None]:
# Keep copies of both frames as they are before the text column is rewritten.
trainc, testc = train.copy(), test.copy()

In [None]:
import string
import re

def clean_text(text):
    '''Normalise one raw text entry before tokenisation.

    Steps, in order: lowercase; remove text in square brackets; remove links;
    remove angle-bracket tags; remove punctuation; replace newlines with a
    space; remove words containing digits.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN from a missing CSV cell)
        is treated as empty text.

    Returns
    -------
    str
        The cleaned text.
    '''
    # Missing cells arrive as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Raw strings keep the regex backslashes away from Python's own
    # string-escape processing (non-raw '\[' is an invalid escape sequence).
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space rather than nothing, so the last word of
    # one line is not fused with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


# Run the cleaning function over the text column of both datasets
train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

# Inspect the first cleaned entries
train['text'].head()

In [None]:
# Split every cleaned string into a list of tokens (runs of \w characters),
# using one shared tokenizer for the training and test sets
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
train['text'] = train['text'].apply(tokenizer.tokenize)
test['text'] = test['text'].apply(tokenizer.tokenize)
train['text'].head()

In [None]:
# Download the NLTK "stopwords" resource used by the stopword-removal step.
nltk.download("stopwords")

In [None]:
from nltk.corpus import stopwords
def remove_stopwords(text):
    """
    Remove English stopwords from a list of tokens.

    Parameters
    ----------
    text : iterable of str
        The tokens of one document.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stopword list, in their
        original order.
    """
    # Load the stopword list once and cache it on the function as a set.
    # Calling stopwords.words() inside the comprehension re-evaluated it for
    # every single token and then searched the resulting list linearly.
    english_stopwords = getattr(remove_stopwords, "_english_stopwords", None)
    if english_stopwords is None:
        english_stopwords = frozenset(stopwords.words('english'))
        remove_stopwords._english_stopwords = english_stopwords
    return [w for w in text if w not in english_stopwords]


# Strip stopwords from the token lists of both datasets
train['text'] = train['text'].apply(remove_stopwords)
test['text'] = test['text'].apply(remove_stopwords)
train.head()


In [None]:
# After preprocessing, join each row's tokens back into a single string
def combine_text(list_of_text):
    '''Join the given strings into one string, separated by single spaces.'''
    return ' '.join(list_of_text)
# Turn every token list back into one space-separated string
train['text'] = train['text'].apply(combine_text)
test['text'] = test['text'].apply(combine_text)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [None]:
# TF-IDF features over unigrams and bigrams. The vectorizer is fitted on the
# training text and then reused, unchanged, to transform the test text.
tfidf = TfidfVectorizer(
    min_df=2,
    max_df=0.5,
    ngram_range=(1, 2),
)
train_vectors = tfidf.fit_transform(train['text'])
test_vectors = tfidf.transform(test['text'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier
from sklearn.metrics import log_loss

from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

In [None]:
#split our data into train and test
from sklearn.model_selection import train_test_split

# Feature matrix (TF-IDF vectors) and target array (encoded labels)
X, y = train_vectors, train.label.values

In [None]:

# Hold out 20% of the rows for validation; the split is shuffled with a fixed
# seed and stratified on the labels.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, random_state=42, shuffle=True, stratify=y
)

In [None]:

# Multinomial Naive Bayes classifier fitted on the training split
news_classifier = MultinomialNB()
news_classifier.fit(X_train, y_train)

# Class probabilities for the validation split, scored with log loss
y_probas = news_classifier.predict_proba(X_valid)
log_loss(y_valid, y_probas)

In [None]:

# Random forest (150 trees, depth capped at 6), scored with validation log loss
rfr = RandomForestClassifier(
    n_estimators=150,
    max_depth=6,
    random_state=0,
)
rfr.fit(X_train, y_train)
rfr_probas = rfr.predict_proba(X_valid)
log_loss(y_valid, rfr_probas)

In [None]:

# XGBoost classifier with default settings, scored with validation log loss
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_probas = xgb.predict_proba(X_valid)
log_loss(y_valid, xgb_probas)

In [None]:
# Logistic regression, scored with validation log loss
logr = LogisticRegression(
    max_iter=150,
    C=7,
    random_state=23,
)
logr.fit(X_train, y_train)
logr_probas = logr.predict_proba(X_valid)
log_loss(y_valid, logr_probas)

In [None]:

# Class probabilities for the test set from the fitted XGBoost model
xgb_probass = xgb.predict_proba(test_vectors)

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Preview the first rows of the sample submission frame.
sub.head()

In [None]:
# create submission file
# predict_proba returns one column per class, ordered as xgb.classes_ (the
# encoded labels). Name each column by inverting label_mapping instead of
# hard-coding an order: with the encoding Depression=0, Drugs=1, Suicide=2,
# Alcohol=3, a hard-coded ['Depression', 'Alcohol', 'Suicide', 'Drugs'] would
# swap the Alcohol and Drugs probabilities.
class_names = {code: name for name, code in label_mapping.items()}
proba_cols = [class_names[int(code)] for code in xgb.classes_]
submission_df = pd.DataFrame(xgb_probass, columns=proba_cols)

# add test IDs by position, so the result does not depend on test's index
submission_df['ID'] = test['ID'].to_numpy()

# rearrange columns into the submission layout
# NOTE(review): confirm this column order against SampleSubmission.csv
submission_cols = ['Depression', 'Alcohol', 'Suicide', 'Drugs']
submission_df = submission_df[['ID'] + submission_cols]

# save submission file
submission_df.to_csv("first_submission.csv", index=False)

In [None]:
# Preview the first rows of the submission frame.
submission_df.head()