In [3]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv("/content/Claim_train_data.csv",names = ["description","claimsType"])
data.head()

Unnamed: 0,description,claimsType
0,description,claimType
1,water is currently leaking through ceiling in ...,ESCAPEWATER
2,Leak from kitchen ceiling,ESCAPEWATER
3,I was holding my phone and as I turned round I...,ADSPECIFIED
4,Apparent leak to side of shower unit as discov...,ESCAPEWATER


In [5]:
data.shape

(5716, 2)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5716 entries, 0 to 5715
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  5716 non-null   object
 1   claimsType   5716 non-null   object
dtypes: object(2)
memory usage: 89.4+ KB


In [7]:
data.isna().sum()

description    0
claimsType     0
dtype: int64

In [8]:
data.dropna(inplace=True)

In [9]:

data.shape

(5716, 2)

In [10]:
data.describe()

Unnamed: 0,description,claimsType
count,5716,5716
unique,5687,28
top,Awaiting description of loss. Notified offline...,ESCAPEWATER
freq,11,1561


In [11]:
data.head()

Unnamed: 0,description,claimsType
0,description,claimType
1,water is currently leaking through ceiling in ...,ESCAPEWATER
2,Leak from kitchen ceiling,ESCAPEWATER
3,I was holding my phone and as I turned round I...,ADSPECIFIED
4,Apparent leak to side of shower unit as discov...,ESCAPEWATER


In [12]:
data['claimsType'].value_counts()

ESCAPEWATER       1561
STORM              776
ADCONTENTS         562
ADSPECIFIED        379
OTHER              296
BURGLARY           293
THEFTSPECIFIED     238
BLOCKEDDRAINS      233
ADBUILDINGS        230
BOILER             215
BURSTPIPE          180
LOSSSPECIFIED      161
SUBSIDENCE         155
FIRE                72
IMPACT              68
VANDALISM           51
LOSTKEY             42
FREEZER             39
FLOOD               38
TREEFALL            37
WIRINGFAULT         36
PLPROPERTY          25
ESCAPEOIL            9
PLINJURY             8
LANDSLIP             5
LIGHTNING            4
EXPLOSION            2
claimType            1
Name: claimsType, dtype: int64

Text Preprocessing

In [13]:
import nltk

In [14]:
data["description"] = data["description"].str.lower()
data.head()

Unnamed: 0,description,claimsType
0,description,claimType
1,water is currently leaking through ceiling in ...,ESCAPEWATER
2,leak from kitchen ceiling,ESCAPEWATER
3,i was holding my phone and as i turned round i...,ADSPECIFIED
4,apparent leak to side of shower unit as discov...,ESCAPEWATER


In [15]:
data['description'][1]

'water is currently leaking through ceiling in kitchen.  bathroom is room above. im unable to isolate the exact cause of the leak, but i think it may be coming from a radiator pipe.  bathroom above is tiled, so underfloor pipes cannot be accessed.'

In [16]:
#function to remove the punctuation
import string
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

data["description"] = data["description"].apply(lambda text: remove_punctuation(text))
data.head()

Unnamed: 0,description,claimsType
0,description,claimType
1,water is currently leaking through ceiling in ...,ESCAPEWATER
2,leak from kitchen ceiling,ESCAPEWATER
3,i was holding my phone and as i turned round i...,ADSPECIFIED
4,apparent leak to side of shower unit as discov...,ESCAPEWATER


In [17]:
data['description'][1]

'water is currently leaking through ceiling in kitchen  bathroom is room above im unable to isolate the exact cause of the leak but i think it may be coming from a radiator pipe  bathroom above is tiled so underfloor pipes cannot be accessed'

In [18]:
import nltk
nltk.download('stopwords')
import string
from nltk.corpus import stopwords

def process_text(text):
    no_punc = [char for char in text if char not in string.punctuation]
    no_punc = ''.join(no_punc)
    
    
    return ' '.join([word for word in no_punc.split() if word.lower() not in stopwords.words('english')])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
data['description']=data['description'].apply(process_text)

In [20]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stemming (text):
    return ''.join([stemmer.stem(word) for word in text])

data['description']=data['description'].apply(stemming)
data.head()    

Unnamed: 0,description,claimsType
0,description,claimType
1,water currently leaking ceiling kitchen bathro...,ESCAPEWATER
2,leak kitchen ceiling,ESCAPEWATER
3,holding phone turned round accidentally knocke...,ADSPECIFIED
4,apparent leak side shower unit discovered toda...,ESCAPEWATER


In [21]:
data['description']

0                                             description
1       water currently leaking ceiling kitchen bathro...
2                                    leak kitchen ceiling
3       holding phone turned round accidentally knocke...
4       apparent leak side shower unit discovered toda...
                              ...                        
5711    chris bell c r electrical began work fri 16th ...
5712                        water leak outside brick wall
5713    water dripping 1 top floor extension wall lamp...
5714            heating system breakdown oil fired boiler
5715    emergency home cover water pipe connecting was...
Name: description, Length: 5716, dtype: object

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,ngram_range=(1,3))

X = cv.fit_transform(data['description'])

In [23]:
X.shape

(5716, 5000)

In [24]:
y = data['claimsType']

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [44]:
cv.get_feature_names_out()[150:300]

array(['also caused', 'also checked', 'also damage', 'also damaged',
       'also due', 'also found', 'also leaking', 'also lifted',
       'also lost', 'also need', 'also noticed', 'also noticed water',
       'also video', 'also water', 'also wet', 'although', 'always',
       'amazon', 'ambulance', 'american', 'american fridge', 'amount',
       'amount water', 'amsterdam', 'angle', 'anglian', 'anglian water',
       'annex', 'another', 'another part', 'anymore', 'anyone',
       'anything', 'anywhere', 'apart', 'apparent', 'apparently',
       'appear', 'appeared', 'appeared ceiling', 'appearing',
       'appearing kitchen', 'appears', 'appears coming', 'appears leak',
       'appears leaking', 'appears water', 'apple', 'apple ipad',
       'apple mac', 'apple macbook', 'apple macbook pro', 'apple store',
       'apple watch', 'appliance', 'appliances', 'appointed',
       'appointment', 'appreciate', 'approved', 'approx', 'approximately',
       'april', 'area', 'area causing', 'a

In [45]:
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [46]:
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB()
nb.fit(X_train,y_train)

In [47]:
y_pred = nb.predict(X_test)

In [48]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

                precision    recall  f1-score   support

   ADBUILDINGS       0.46      0.30      0.36        40
    ADCONTENTS       0.67      0.78      0.72       122
   ADSPECIFIED       0.47      0.31      0.37        81
 BLOCKEDDRAINS       0.88      0.68      0.77        56
        BOILER       0.80      0.88      0.84        41
      BURGLARY       0.63      0.79      0.70        62
     BURSTPIPE       0.59      0.50      0.54        38
   ESCAPEWATER       0.77      0.92      0.84       314
          FIRE       0.25      0.14      0.18         7
         FLOOD       0.00      0.00      0.00         7
       FREEZER       0.90      0.75      0.82        12
        IMPACT       0.91      0.62      0.74        16
 LOSSSPECIFIED       0.58      0.72      0.65        29
       LOSTKEY       0.00      0.00      0.00         8
         OTHER       0.34      0.21      0.26        56
      PLINJURY       0.00      0.00      0.00         4
    PLPROPERTY       0.00      0.00      0.00  