# Data Cleaning and Splitting

In [1]:
import pandas as pd

from nltk import word_tokenize
from nltk.stem import PorterStemmer
import re

from sklearn.model_selection import train_test_split

In [2]:
# Load the raw ticket dump from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='all_tickets.csv')

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
data.head(n=5)

Unnamed: 0,title,body,ticket_type,category,sub_category1,sub_category2,business_service,urgency,impact
0,,hi since recruiter lead permission approve req...,1,4,2,21,71,3,4
1,connection with icon,icon dear please setup icon per icon engineers...,1,6,22,7,26,3,4
2,work experience user,work experience user hi work experience studen...,1,5,13,7,32,3,4
3,requesting for meeting,requesting meeting hi please help follow equip...,1,5,13,7,32,3,4
4,reset passwords for external accounts,re expire days hi ask help update passwords co...,1,4,2,76,4,3,4


In [4]:
# Dimensions of the loaded frame as (number of rows, number of columns).
data.shape

(48549, 9)

In [5]:
# Count the missing values in each column (one total per column).
data.isna().sum(axis=0)

title               712
body                  0
ticket_type           0
category              0
sub_category1         0
sub_category2         0
business_service      0
urgency               0
impact                0
dtype: int64

In [6]:
# Replace missing text with an empty string so that title and body can be
# concatenated without producing NaN. Only the two free-text columns are
# filled: a blanket fillna('') would also turn any missing numeric label into
# '' and silently change that column to object dtype instead of surfacing it.
data = data.fillna({'title': '', 'body': ''})

In [7]:
# Build a single free-text column: the title, one space, then the body.
data['info'] = data['title'].str.cat(data['body'], sep=' ')

In [8]:
# title and body now live in `info`, so remove the two source columns in place.
data.drop(['title', 'body'], axis='columns', inplace=True)

In [10]:
# Single Porter stemmer instance, created once and reused for every token
# when the `info` column is cleaned.
stemmer = PorterStemmer()

In [12]:
# Normalise the info column: keep letters only, tokenize, lowercase and stem.
def clean_info(text):
    """Return `text` as space-separated, lower-cased Porter stems.

    Every run of characters outside A-Z / a-z (digits, punctuation, extra
    whitespace, ...) is first replaced by a single space, so only alphabetic
    tokens reach the tokenizer. The stemmed tokens are joined back together
    with single spaces.
    """
    letters_only = re.sub('[^A-Za-z]+', ' ', text)
    stems = []
    for token in word_tokenize(letters_only):
        stems.append(stemmer.stem(token.lower().strip()))
    return ' '.join(stems)


data['info'] = data['info'].apply(clean_info)

# Splitting Data into training, validation and testing sets

Splitting the data into 3 categories (training set, validation set and testing set) in an 8:1:1 ratio.

In [13]:
# First cut: 80% of the tickets become the training set; the remaining 20% is
# held out to be divided into test and validation sets. The split is
# stratified on ticket_type and seeded so it is repeatable.
train, test_and_valid = train_test_split(
    data,
    stratify=data['ticket_type'],
    train_size=0.8,
    random_state=35,
)

In [14]:
# Second cut: halve the held-out portion into test and validation sets,
# again stratified on ticket_type and using the same seed.
test, valid = train_test_split(
    test_and_valid,
    stratify=test_and_valid['ticket_type'],
    train_size=0.5,
    random_state=35,
)

In [15]:
# Show the (rows, columns) of each split, in the order train, valid, test.
tuple(subset.shape for subset in (train, valid, test))

((38839, 8), (4855, 8), (4855, 8))

**Writing it back to CSV files**

In [16]:
# Persist the training split; the row index is not written to the file.
train.to_csv(path_or_buf='ticket_train.csv', index=False)

In [17]:
# Persist the testing split; the row index is not written to the file.
test.to_csv(path_or_buf='ticket_test.csv', index=False)

In [18]:
# Persist the validation split; the row index is not written to the file.
valid.to_csv(path_or_buf='ticket_valid.csv', index=False)