In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Show which files are present in the ../input directory.
print(os.listdir("../input"))

In [None]:
# Load the training set from ../input/train.csv.
train = pd.read_csv('../input/train.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Pick out the three categorical feature columns.
# .copy() makes cat_df an independent frame, so the fill / dtype steps that follow
# modify cat_df itself rather than a (possible) view of `train`, and do not trigger
# SettingWithCopyWarning.
cat_df = train[['Transaction-Type','Company-response','Consumer-disputes']].copy()

In [None]:
# Replace missing categorical values with explicit placeholder labels.
# The result is assigned back instead of calling fillna(inplace=True) on a selected
# column: that chained form operates on a temporary object, warns on older pandas and
# leaves cat_df unchanged when Copy-on-Write is enabled.
cat_df = cat_df.fillna({'Company-response': 'No Response', 'Consumer-disputes': 'No'})

In [None]:
# Convert the three feature columns to the pandas 'category' dtype in one call.
# astype returns a new frame, so nothing is written into a frame that may still be
# a slice of `train` (which is what raises SettingWithCopyWarning).
cat_df = cat_df.astype({
    'Transaction-Type': 'category',
    'Company-response': 'category',
    'Consumer-disputes': 'category',
})

In [None]:
# One-hot encode the categorical columns. get_dummies builds a new frame and leaves
# cat_df untouched; each source column gets its own short prefix for the dummy names.
cat_df_onehot = pd.get_dummies(
    cat_df,
    columns=['Transaction-Type', 'Company-response', 'Consumer-disputes'],
    prefix={
        'Transaction-Type': 'Transaction',
        'Company-response': 'Response',
        'Consumer-disputes': 'Disputes',
    },
)

In [None]:
# Preview the one-hot encoded categorical features.
cat_df_onehot.head()

In [None]:
# Count missing values per encoded column.
cat_df_onehot.isnull().sum()

In [None]:
# Take the free-text 'Complaint-reason' column (a Series at this point).
text_df = train['Complaint-reason']

In [None]:
# Number of missing complaint reasons.
text_df.isnull().sum()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Wrap the Series in a DataFrame so feature columns can be added next to the text.
text_df = pd.DataFrame(text_df)

In [None]:
# Show the column labels of the text frame.
text_df.columns

In [None]:
from nltk.corpus import stopwords
# English stop-word list from NLTK; text_features() reads this module-level name.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be available locally - confirm.
stop = stopwords.words('english')

In [None]:
def avg_word(sentence):
    """Return the mean length (in characters) of the whitespace-separated words.

    Parameters
    ----------
    sentence : str
        Text to measure. Any non-string value (e.g. NaN or None for a missing
        entry) is treated as containing no words.

    Returns
    -------
    float
        Average word length, or 0.0 when there are no words (empty string,
        whitespace only, or a non-string input) instead of raising.
    """
    if not isinstance(sentence, str):
        return 0.0
    words = sentence.split()
    if not words:
        # Guard against dividing by zero for empty / whitespace-only text.
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [None]:
# Occurrence count of every distinct complaint reason in the training data.
text_df_counts = text_df['Complaint-reason'].value_counts()

In [None]:
# Spot-check the count for one specific reason (label lookup; fails with KeyError if absent).
text_df_counts['Incorrect information on credit report']

In [None]:
def text_features(df,df_counts):
    """Add simple text statistics for 'Complaint-reason' to `df` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Complaint-reason' column. Five new columns are
        written onto it: word_count, char_count, avg_word, stopwords and
        sentence_freq.
    df_counts : pandas.Series or mapping
        Occurrence count keyed by complaint-reason text (e.g. value_counts()).

    Returns
    -------
    None
        The frame is modified in place.

    Notes
    -----
    Missing reasons are treated as empty text, and reasons that do not appear
    in `df_counts` get a frequency of 0, so the function no longer raises on
    NaN values or unseen reasons.
    """
    # Normalise once so every string operation below is safe on missing values.
    reasons = df['Complaint-reason'].fillna('').astype(str)
    # Set membership is O(1); the module-level `stop` list would be scanned per word.
    stop_set = set(stop)
    n_rows = len(df)

    df['word_count'] = reasons.apply(lambda text: len(text.split(" ")))
    df['char_count'] = reasons.str.len()
    # Skip avg_word for texts without any word so it is never asked to divide by zero.
    df['avg_word'] = reasons.apply(lambda text: avg_word(text) if text.split() else 0.0)
    df['stopwords'] = reasons.apply(lambda text: sum(word in stop_set for word in text.split()))
    # Relative frequency of the full reason string; unseen reasons map to 0.
    df['sentence_freq'] = reasons.map(df_counts).fillna(0) / n_rows

In [None]:
# Add the text-statistics columns to text_df (modified in place).
text_features(text_df,text_df_counts)

In [None]:
# Preview the text frame with its new feature columns.
text_df.head()

In [None]:
# Keep only the derived numeric features; the raw text column is not fed to the model.
text_df_encoded = text_df.drop(columns=['Complaint-reason'])

In [None]:
# Preview the numeric text features.
text_df_encoded.head()

In [None]:
# Give both feature frames a fresh 0..n-1 index so the column-wise concat lines rows up.
text_df_encoded = text_df_encoded.reset_index(drop=True)
cat_df_onehot = cat_df_onehot.reset_index(drop=True)

In [None]:
# (rows, columns) of the text features - row count should match cat_df_onehot.
text_df_encoded.shape

In [None]:
# (rows, columns) of the one-hot categorical features.
cat_df_onehot.shape

In [None]:
# Join categorical and text features side by side (rows are aligned on the index).
final_df = pd.concat([cat_df_onehot,text_df_encoded],axis=1)

In [None]:
# Preview the combined training feature matrix.
final_df.head()

In [None]:
# Check for missing values per feature column after the concat.
final_df.isnull().sum()

In [None]:
from datetime import date

In [None]:
# Parse both date columns into datetimes.
train['Date-received'] = pd.to_datetime(train['Date-received'])
train['Date-sent-to-company'] = pd.to_datetime(train['Date-sent-to-company'])
# Whole days between receiving the complaint and forwarding it to the company.
# .dt.days is the vectorised accessor and replaces a Python-level apply over every row.
train['Days-lag'] = (train['Date-sent-to-company'] - train['Date-received']).dt.days

In [None]:
# Preview the computed day lag.
train['Days-lag'].head()

In [None]:
# The single numeric feature: day lag (a Series carrying train's index).
num_df = train['Days-lag']

In [None]:
# Append Days-lag as an extra column; concat aligns rows by index label.
# NOTE(review): assumes train still has its default 0..n-1 index so it matches the
# reset index of final_df - confirm.
final_df = pd.concat([final_df,num_df],axis=1)

In [None]:
# Preview the full training feature matrix including Days-lag.
final_df.head()

In [None]:
# Encode the target label as integer category codes.
# NOTE(review): the codes follow pandas' category order (sorted labels for strings) and
# convert_labels() later hard-codes that same order - confirm both stay in sync.
train['Complaint-Status'] = train['Complaint-Status'].astype('category')
target = train['Complaint-Status'].cat.codes

In [None]:
# Preview the encoded target values.
target.head()

In [None]:
# Class distribution of the encoded target.
target.value_counts()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for validation; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(final_df, target, test_size=0.33, random_state=42)

In [None]:
# Random forest with 100 trees, each capped at depth 8; random_state=0 for repeatable fits.
clf = RandomForestClassifier(n_estimators=100, max_depth=8,random_state=0)

In [None]:
# Fit the classifier on the training split.
clf.fit(X_train, y_train)

In [None]:
# Predict class codes for the held-out split.
prediction = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy on the held-out split.
accuracy = accuracy_score(y_test, prediction)

In [None]:
# Display the held-out accuracy.
accuracy

In [None]:
# Predict on the training split itself, to compare against the held-out score.
train_pred = clf.predict(X_train)

In [None]:
# Accuracy on the training split.
acc = accuracy_score(y_train, train_pred)

In [None]:
# Display the training accuracy.
acc

In [None]:
# Load the sample submission file from ../input/sample_submission.csv.
sample_sub = pd.read_csv('../input/sample_submission.csv')

In [None]:
# Preview the sample submission layout.
sample_sub.head()

In [None]:
# Load the test set from ../input/test.csv.
test = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Pick out the same three categorical columns from the test set.
# .copy() makes cat_df_test an independent frame, so the fill / dtype steps that follow
# modify it directly rather than a (possible) view of `test`, and do not trigger
# SettingWithCopyWarning.
cat_df_test = test[['Transaction-Type','Company-response','Consumer-disputes']].copy()

In [None]:
# Replace missing categorical values with the same placeholder labels used for training.
# The result is assigned back instead of calling fillna(inplace=True) on a selected
# column: that chained form operates on a temporary object, warns on older pandas and
# leaves cat_df_test unchanged when Copy-on-Write is enabled.
cat_df_test = cat_df_test.fillna({'Company-response': 'No Response', 'Consumer-disputes': 'No'})

In [None]:
# Convert the three feature columns to the pandas 'category' dtype in one call.
# astype returns a new frame, so nothing is written into a frame that may still be
# a slice of `test` (which is what raises SettingWithCopyWarning).
cat_df_test = cat_df_test.astype({
    'Transaction-Type': 'category',
    'Company-response': 'category',
    'Consumer-disputes': 'category',
})

In [None]:
# One-hot encode the test categorical columns. get_dummies builds a new frame and leaves
# cat_df_test untouched; each source column gets its own short prefix for the dummy names.
cat_df_onehot2 = pd.get_dummies(
    cat_df_test,
    columns=['Transaction-Type', 'Company-response', 'Consumer-disputes'],
    prefix={
        'Transaction-Type': 'Transaction',
        'Company-response': 'Response',
        'Consumer-disputes': 'Disputes',
    },
)

In [None]:
# Take the free-text 'Complaint-reason' column of the test set (a Series at this point).
text_df2 = test['Complaint-reason']

In [None]:
# Inspect the object type before wrapping it in a DataFrame.
type(text_df2)

In [None]:
# Wrap the Series in a DataFrame so feature columns can be added next to the text.
text_df2 = pd.DataFrame(text_df2)

In [None]:
# Inspect the object type again after the conversion.
type(text_df2)

In [None]:
# Occurrence count of every distinct complaint reason, computed on the test set itself.
# NOTE(review): the training features used counts from the training set, so
# sentence_freq is derived from a different distribution here - confirm this is intended.
text_df2_counts = text_df2['Complaint-reason'].value_counts()

In [None]:
# Add the text-statistics columns to text_df2 (modified in place).
text_features(text_df2,text_df2_counts)

In [None]:
# Preview the test text frame with its new feature columns.
text_df2.head()

In [None]:
# Keep only the derived numeric features; the raw text column is not fed to the model.
text_df_encoded2 = text_df2.drop(columns=['Complaint-reason'])

In [None]:
# Preview the numeric text features of the test set.
text_df_encoded2.head()

In [None]:
# Give both test feature frames a fresh 0..n-1 index so the column-wise concat lines rows up.
text_df_encoded2 = text_df_encoded2.reset_index(drop=True)
cat_df_onehot2 = cat_df_onehot2.reset_index(drop=True)

In [None]:
# Join the test categorical and text features side by side.
# NOTE: this rebinds final_df, which held the training feature matrix until now;
# re-running the earlier train/split cells after this point would pick up test features.
final_df = pd.concat([cat_df_onehot2,text_df_encoded2],axis=1)

In [None]:
# Preview the combined test feature matrix.
final_df.head()

In [None]:
from datetime import date

In [None]:
# Parse both date columns into datetimes.
test['Date-received'] = pd.to_datetime(test['Date-received'])
test['Date-sent-to-company'] = pd.to_datetime(test['Date-sent-to-company'])
# Whole days between receiving the complaint and forwarding it to the company.
# .dt.days is the vectorised accessor and replaces a Python-level apply over every row.
test['Days-lag'] = (test['Date-sent-to-company'] - test['Date-received']).dt.days

In [None]:
# The numeric day-lag feature for the test set (a Series carrying test's index).
num_df2 = test['Days-lag']

In [None]:
# Append Days-lag as an extra column; concat aligns rows by index label.
# NOTE(review): assumes test still has its default 0..n-1 index so it matches the
# reset index of final_df - confirm.
final_df = pd.concat([final_df,num_df2],axis=1)

In [None]:
# Train and test were one-hot encoded independently, so the test frame can lack a dummy
# column seen in training, carry an extra one, or list them in a different order.
# Reindexing to the fitted columns adds missing dummies as 0, drops unseen ones and
# restores the training column order before predicting.
test_pred = clf.predict(final_df.reindex(columns=X_train.columns, fill_value=0))

In [None]:
# Attach the raw integer class predictions to the test frame.
test['Complaint-Status'] = test_pred

In [None]:
# Preview the test frame with the predicted codes.
test.head()

In [None]:
def convert_labels(x):
    """Translate an integer class code into its complaint-status label.

    Codes 0 through 4 correspond, in order, to the five status strings listed
    below. Any value that equals none of those codes yields None.
    """
    status_labels = (
        'Closed',
        'Closed with explanation',
        'Closed with monetary relief',
        'Closed with non-monetary relief',
        'Untimely response',
    )
    # Compare with == (not a dict lookup) so any numeric type equal to the code matches.
    for code, label in enumerate(status_labels):
        if x == code:
            return label
    return None

In [None]:
# Decode the predicted class codes into their status strings.
# The labels are derived from test_pred rather than from the column being overwritten,
# so running this cell more than once gives the same result (re-applying
# convert_labels to already-decoded strings would turn every value into None).
test['Complaint-Status'] = [convert_labels(code) for code in test_pred]

In [None]:
# Keep only the ID and the predicted status for the submission file.
sub_test = test[['Complaint-ID','Complaint-Status']]

In [None]:
# Preview the submission frame.
sub_test.head()

In [None]:
# Write the submission to sub.csv in the working directory, without the index column.
sub_test.to_csv('sub.csv',index=False)