In [None]:
!pip install urlextract
!pip install spacy
!pip install contractions
!pip install pyenchant
!sudo yum install -y enchant
!sudo yum install -y hunspell-en

In [None]:
import nltk
# Download the NLTK resources requested by name.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
# NOTE(review): 'all' presumably already includes the three packages above and is a large download - confirm it is needed.
nltk.download('all')
# Install the small English spaCy pipeline through the spaCy CLI.
!python -m spacy download en_core_web_sm

In [1]:
import os
import random
import string
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
from datetime import datetime
import boto3
import dask.dataframe as dd
import tempfile



%matplotlib inline
# %matplotlib notebook


# Display settings: show every column, up to 5000 rows, and format floats with str().
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5000)
pd.set_option('display.float_format', str)
# Default figure size and the colour cycle of the active matplotlib style.
matplotlib.rcParams['figure.figsize'] = (10, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

# NOTE(review): absolute, machine-specific paths; base_dir ends with a trailing '/'.
working_dir = '/home/ec2-user/SageMaker'
base_dir = '/home/ec2-user/SageMaker/topic_modelling/'
# s3_data_path = 's3://bucket-sushant/bangla-character-recognition/'

## For reproducible results
seed_value = 18
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# presumably only affects child processes, not this kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)
# Seed the stdlib and the global NumPy random generators.
random.seed(seed_value)
np.random.seed(seed_value)
# https://stackoverflow.com/questions/5836335/consistently-create-same-random-numpy-array/5837352#5837352
# Dedicated, seeded RandomState instance (independent of the global NumPy generator).
random_state = np.random.RandomState(seed=seed_value)
# Shared S3 client used by the read/store helpers defined below in this cell.
s3 = boto3.client('s3')

def read_from_s3(file_path):
    """Fetch an S3 object and return its raw body.

    Args:
        file_path: '/'-separated location; the third segment is used as the
            bucket and everything after it as the key (i.e. the
            ``s3://bucket/key`` layout).

    Returns:
        Whatever ``response['Body'].read()`` yields for the object.
    """
    segments = file_path.split('/')
    bucket_name = segments[2]
    object_key = '/'.join(segments[3:])
    # Uses the module-level boto3 client `s3`.
    return s3.get_object(Bucket=bucket_name, Key=object_key)['Body'].read()

def read_pickle_from_s3(file_path):
    """Download the object at ``file_path`` via read_from_s3 and unpickle it.

    NOTE(review): unpickling can execute arbitrary code - only use this on
    objects from a trusted bucket.
    """
    return pickle.loads(read_from_s3(file_path))

# def read_csv_from_s3(file_path):
#     data = pd.read_csv(file_path, low_memory=False)
#     return data

def store_object_to_s3_as_pickle(data, file_path):
    """Pickle ``data`` and upload it to S3.

    Args:
        data: any picklable object.
        file_path: '/'-separated destination; the third segment is used as the
            bucket and everything after it as the key (``s3://bucket/key``).

    The pickle is spooled through a temporary file rather than built in memory
    with ``pickle.dumps`` (which, per the original note, uses a lot of memory).
    The temporary file is always removed, even if pickling or the upload fails.
    """
    segments = file_path.split('/')
    bucket_name = segments[2]
    key = '/'.join(segments[3:])
    fd, path = tempfile.mkstemp()
    try:
        # mkstemp() hands back an already-open descriptor; wrap it so that it is
        # closed together with the file object instead of being leaked.
        with os.fdopen(fd, 'wb') as pointer:
            pickle.dump(data, pointer)
        with open(path, "rb") as pointer:
            s3.upload_fileobj(pointer, bucket_name, key)
    finally:
        os.remove(path)

In [2]:
import nltk
from pandas import option_context
import ast
from urlextract import URLExtract
import string

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer 
import spacy
from nltk.corpus import stopwords
import re
import contractions
import enchant

In [3]:
# Print the notebook's current working directory.
!pwd

/home/ec2-user/SageMaker/topic_modelling/DataCleaning


# Data Load

In [4]:
# Load the raw CSV export (decoded as ISO-8859-1) and preview it.
# NOTE: base_dir already ends with '/', so the path built here contains '//'.
data = pd.read_csv(f'{base_dir}/Data/EC2_India_ES.csv', encoding='ISO-8859-1', low_memory=False)
print(data.shape)
data.head()
# data.style.set_properties(subset=['comm_body'], **{'width': '300px'})


(14234, 11)


Unnamed: 0,case_id,customer_name,service,case_billing_region,customer_billing_country_name,comm_owner_agent_login,comm_body,case_creation_cal_date,comm_date_utc,comm_subject,case_severity
0,7330879201,Zee Entertainment Enterprises Ltd,Elastic Compute Cloud (EC2 - Windows),APAC,INDIA,abagrech,"{""transcript"":[{""action"":""TRANSCRIPT_START"",""d...",8/31/2020 0:00,8/31/2020 0:00,,2
1,7331080451,Jubilant FoodWorks Limited,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,prshanta,"{""transcript"":[{""action"":""TRANSCRIPT_START"",""d...",8/31/2020 0:00,8/31/2020 0:00,,3
2,7331080451,Jubilant FoodWorks Limited,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,prshanta,"{""transcript"":[{""action"":""TRANSCRIPT_START"",""d...",8/31/2020 0:00,8/31/2020 0:00,,3
3,7331844541,ALL_DEPRECATED,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,shrirodi,"{""transcript"":[{""action"":""TRANSCRIPT_START"",""d...",8/31/2020 0:00,8/31/2020 0:00,,4
4,7331853761,PayU Payments Private Limited,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,ratheed,"{""transcript"":[{""action"":""TRANSCRIPT_START"",""d...",8/31/2020 0:00,8/31/2020 0:00,,3


In [5]:
# Show the first comm_body; the column-width limit is raised to 10000 characters only inside this block.
with option_context('display.max_colwidth', 10000):
    display(data.comm_body.head(1))

0    {"transcript":[{"action":"TRANSCRIPT_START","data":{"chatId":"euwzkkSWQl2q1GorO99S89spRs8ccSaPmbAyDk8dQm4VZSLiLkGooizM3gDs4S01wWq7tAR-Xlc","customerInfo":{"initialQuestion":"Regarding case 7330879201","customerIdentifierToken":"amzn1.token.ABINMLVUBIJCIX2ZTGLNJQSXJSUQ","customerIdentifierType":"E","customerName":"assumed-role/z5x_iam_role_devops_engineer/sampath.sirigiri@zee.esselgroup.com","attributes":{}},"entryPoint":"ACD_AWSEnEc2WindowsTier3Chat","timestamp":1598854208615}},{"action":"PARTICIPANT_CHANGE","data":{"participant":{"userName":"customer","displayName":"assumed-role/z5x_iam_role_devops_engineer/sampath.sirigiri@zee.esselgroup.com","participantId":"1","state":"CONNECTED","preferences":{}},"reconnected":false,"timestamp":1598854210457}},{"action":"PARTICIPANT_CHANGE","data":{"participant":{"userName":"agent","displayName":"Abhinav","participantId":"8","state":"CONNECTED","preferences":{"language":""}},"reconnected":false,"timestamp":1598854368446}},{"action":"PARTICIPA

---

# Parsing dictionaries in the rows if they are there

In [None]:
def check_dict(v):
    """Tell whether ``v`` is the text of a Python dict literal.

    Args:
        v: value to test (normally a string).

    Returns:
        True if ``ast.literal_eval(v)`` yields a dict, False if it yields any
        other literal, and None if ``v`` cannot be evaluated as a literal
        (e.g. truncated text, JSON-only tokens such as ``false``, or NaN).
    """
    try:
        evald = ast.literal_eval(v)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the failures literal_eval documents for malformed input are
        # treated as "not parseable"; anything else (e.g. KeyboardInterrupt)
        # propagates instead of being swallowed by a bare except.
        return None
    return isinstance(evald, dict)

# Classify every comm_body with check_dict (True / False / None).
t = data.comm_body.apply(check_dict)
# A bare expression is rendered only when it is the last line of a cell,
# so this summary has to be displayed explicitly.
display(t.value_counts(dropna=False))

# Last 10 characters of the first 2140 bodies.
data.comm_body.str[-10:].head(2140)

The dictionaries seem to be incomplete, so these rows are ignored.

---

# Ignoring the dictionaries

Total rows with dictionary are : 2131 

In [6]:
# Keep the rows from label 2131 onwards and renumber the index from 0.
# NOTE: positional and not idempotent - re-running this cell drops a further 2131 rows.
data = data.loc[2131:].copy().reset_index(drop=True)

In [7]:
# Shape and first rows after dropping the leading rows.
print(data.shape)
data.head()

(12103, 11)


Unnamed: 0,case_id,customer_name,service,case_billing_region,customer_billing_country_name,comm_owner_agent_login,comm_body,case_creation_cal_date,comm_date_utc,comm_subject,case_severity
0,8415574821,Genpact,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,gupmanav,"Dear Team, I am facing issue in one of the EC...",6/2/2021 0:00,6/2/2021 0:00,1/2 checks EC2,4
1,7938887601,Axiata Digital Services Sdn Bhd,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,arizona,"Hello, We haven't heard back from you regardi...",2/1/2021 0:00,2/9/2021 0:00,1/2 status check,4
2,7938887601,Axiata Digital Services Sdn Bhd,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,wadhwh,while checking the instance the above instance...,2/1/2021 0:00,2/1/2021 0:00,1/2 status check,4
3,7938887601,Axiata Digital Services Sdn Bhd,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,wadhwh,"Hello, We have looked into the issue and we ...",2/1/2021 0:00,2/9/2021 0:00,1/2 status check,4
4,7745171561,Tata Communications Ltd.,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,arizona,Please let us know if we helped resolve your i...,12/16/2020 0:00,12/26/2020 0:00,2/2 check failed,3


---

# Ignoring Nan in Comm Body

In [8]:
# Keep only the rows that have a comm_body.
data = data[data.comm_body.notnull()]

---

# Extracting URLs

In [9]:
# Shape and first rows after removing rows whose comm_body is null.
print(data.shape)
data.head()

(12099, 11)


Unnamed: 0,case_id,customer_name,service,case_billing_region,customer_billing_country_name,comm_owner_agent_login,comm_body,case_creation_cal_date,comm_date_utc,comm_subject,case_severity
0,8415574821,Genpact,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,gupmanav,"Dear Team, I am facing issue in one of the EC...",6/2/2021 0:00,6/2/2021 0:00,1/2 checks EC2,4
1,7938887601,Axiata Digital Services Sdn Bhd,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,arizona,"Hello, We haven't heard back from you regardi...",2/1/2021 0:00,2/9/2021 0:00,1/2 status check,4
2,7938887601,Axiata Digital Services Sdn Bhd,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,wadhwh,while checking the instance the above instance...,2/1/2021 0:00,2/1/2021 0:00,1/2 status check,4
3,7938887601,Axiata Digital Services Sdn Bhd,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,wadhwh,"Hello, We have looked into the issue and we ...",2/1/2021 0:00,2/9/2021 0:00,1/2 status check,4
4,7745171561,Tata Communications Ltd.,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,arizona,Please let us know if we helped resolve your i...,12/16/2020 0:00,12/26/2020 0:00,2/2 check failed,3


In [10]:
extractor = URLExtract()


def extract_url_from_data(x):
    """Return the URLs found in ``x`` as one comma-joined string.

    Args:
        x: text to scan with the module-level ``extractor``.

    Returns:
        The URLs joined with ',' or None when no URL is found. If the
        extractor fails on ``x``, the value is printed and None is returned.

    NOTE(review): a URL that itself contains ',' cannot be told apart from two
    URLs once joined, so a later ``split(',')`` will fragment it.
    """
    try:
        found = extractor.find_urls(x)
    except Exception:
        # Best effort: report the offending value and treat it as "no URL"
        # instead of aborting the whole apply(); unlike a bare except this
        # still lets KeyboardInterrupt / SystemExit through.
        print(x)
        return None
    return ','.join(found) if len(found) > 0 else None

In [11]:
# comm_body = data.comm_body.values
# urls = []
# for d in comm_body:
#     urls.append(extract_url_from_data(d))

In [12]:
# Per-row URL extraction.
data['urls'] = data.comm_body.apply(extract_url_from_data)

# Frequency of each extracted URL string, missing values included.
data['urls'].value_counts(dropna=False)

NaN                                                                                                                                                                                                                                                                                                                                                                                     5181
Viatris.com                                                                                                                                                                                                                                                                                                                                                                                7
0.0.0.0/0                                                                                                                                                                                                                                     

In [13]:
def remove_urls(row):
    """Return ``row['comm_body']`` with every URL listed in ``row['urls']`` blanked out.

    Args:
        row: mapping / Series with a 'comm_body' string and a 'urls' value that
            is either null or a comma-separated string of URLs.

    Returns:
        The body with each URL occurrence replaced by a single space; the body
        unchanged when 'urls' is null.
    """
    body = row['comm_body']
    if pd.isnull(row['urls']):
        return body
    fragments = [u.strip() for u in row['urls'].split(',')]
    # Longest first: otherwise a short URL that is a prefix of a longer one is
    # replaced first and the tail of the longer URL is left behind in the text.
    for fragment in sorted(fragments, key=len, reverse=True):
        # An empty fragment (e.g. from ',,') must be skipped: str.replace('', ' ')
        # would insert a space between every character of the body.
        if fragment:
            body = body.replace(fragment, ' ')
    return body

In [14]:
# Row-wise URL removal; the result is stored as cleaning stage 1.
data['1_url_cleaned_comm_body'] = data.apply(remove_urls, axis=1)
# data['1_url_cleaned_comm_body'].head(5).values

## Removing words which do not mean anything

In [15]:
# Build the token -> frequency table over the distinct URL-cleaned bodies.
sentences = data['1_url_cleaned_comm_body'].unique()
words = {}
for sent in sentences:
    # Lower-case and flatten newlines/tabs before tokenising.
    sent = sent.lower().replace('\n', ' ').replace('\t', ' ')
    for s_token in sent_tokenize(sent):
        for t in word_tokenize(s_token):
            words[t] = words.get(t, 0) + 1

In [16]:
# Number of distinct tokens before any filtering.
print(len(words))

35375


---


words which are of the pattern "i-(0-9)words"

In [17]:
# Queue for removal every token containing "i-" followed by a digit and more word characters.
# The pattern is a raw string: '\w' inside a plain literal is an invalid escape
# sequence that newer Python versions warn about.
words_to_remove = []
for w in words:
    if re.search(r'i-[0-9]\w+', w):
        # Every match is removed; only the frequent ones are echoed to keep the output short.
        if words[w] > 5:
            print(w, words[w])
        words_to_remove.append(w)

i-04b6c49211713957a 7
i-059d1a1009fd35d73 7
i-09c88615ffc5b0ca6 15
i-0f5132ec4afad9364 13
i-09b0cdbecf72224cb 6
i-06983d10fcba71498 6
i-005f131e3c567648b 12
i-0c27554a82026ead7 7
i-0ef724ab7fc1b2e69 9
i-0d73994dcdb72a38d 12
i-0a5f16475cb4aaf5c 9
i-03a20a0076d31e514 6
i-0beec32116a2693e8 6
i-0c1734712b8b9b2c9 7
i-09e46ba27318d150e 8
i-08363e917874c7560 6
i-08d5d95b040cc9d89 13
i-09e79f6f13979da13 12
i-0ca54f7d1809fb59a 6
i-0b085da52bf68d27b 7
i-0a93406eb813b0a4b 8
i-08128d8eefacb10ca 6
i-067f813183a8f516c 6
i-0ce313a417b1ee1e2 13
i-06e7dcd54f70238b1 9
i-0e2b8f26fdb3e948b 6
i-0056b930a5d56f608 10
i-07d1b96631391ddfb 7
i-0f7b2e7735aa0387e 8
i-0ebeb8802717cae72 9
i-0cb2c3f9541efc2aa 7
i-0ce3a60de46db7522 7
i-042838ac3c4e32ddc 9
i-04699d80622ea743c 8
i-0269a0b0823d403b9 6
i-0812e232104f2f48f 8
i-0cd325021dfc8e4d6 7
i-006fb0e6eca439bd2 13
i-0b7a8ea755207a2b3 8
i-0c68498b654180b03 8
i-0b525f5106eebb334 8
i-052626c80f61de1eb 7
i-064121555b548146a 6
i-02b8705f8ff02bb6c 8
i-0cf3cdfeeaf898bb7 8
i

In [18]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

35375
32555


---

words which are separated by at least 1 ':'

In [19]:
# Queue for removal every token that has a ':' between word characters.
# The pattern is a raw string: '\w' inside a plain literal is an invalid escape
# sequence that newer Python versions warn about.
words_to_remove = []
for w in words:
    if re.search(r'\w+:\w+', w):
        # Every match is removed; only the frequent ones are echoed to keep the output short.
        if words[w] > 5:
            print(w, words[w])
        words_to_remove.append(w)

ap-south-1:617558729962 11
3:30 11
3:00 7
3:20 7
retry_handler.go:101 6
retry_handler.go:57 9
00:44 17
19:00 7
5:30 11
00:00 14
03:50:12 12
17:00 10
18:30 8
259:1 9
259:2 8
259:0 9
259:3 7
19:37:13 7
15:19:36 16
__ext4_get_inode_loc:4341 6
14:47:46 15
12:30 13
14:00 13
15:30 7
gmt+5:30 7
10:30 7
11:30 11
01:00 6
utc+5:30 15
14:16 27
13:40 8
11:00 8
00:00:00 29
9:30 7
15:00 15
10:00 8
localhost:9097 6
error:140770fc 6
dpt:22 10
15:29 6
1:00 7
5:00 9
13:30 7
8:00 6
08:20:43 8
09:50:13 8
12:00:00 11
18:00 6
22:00 6
253:0 6
12:38:20 6
21:30 6
11:35 6
172.31.0.2:53 6
19:30 7
06:00 13
08:30 14
23:59 12
16:00 7
202:0 12
202:1 12
202:80 9
202:81 6
file-rss:0kb 17
18:00:00 9
shmem-rss:0kb 24
14:00:14 17
11:40:01 12
02:07:22 6
3:38:42 16
23:00 7
08:00 12
13:16:00 17
08:08:16 10
08:20 6
00:12:38 9
12:00:37 7
05:03:13 17
21:28:54 10
uid:0 8
20:00 6
12:41:22 19
07:23:30 9
19:57:18 10
start=09:34:20 12


In [20]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

32555
30116


---

words which are separated by at least 2 '/'

In [21]:
# Queue for removal every token with at least two '/' separating word characters.
# The pattern is a raw string: '\w' and '\/' inside a plain literal are invalid
# escape sequences that newer Python versions warn about.
words_to_remove = []
for w in words:
    if re.search(r'\w+\/\w+\/\w+', w):
        # Every match is removed; only the frequent ones are echoed to keep the output short.
        if words[w] > 5:
            print(w, words[w])
        words_to_remove.append(w)

/etc/ssh/ssh_config 12
/opt/mssql/bin/mssql-conf 6
/var/log/audit 9
/var/log/messages 54
/etc/default/grub 28
/usr/bin/sudo 12
/usr/bin/python 7
2019/12/09 11
2021/07/02 12
/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json 8
2021/07/29 11
/var/cache/apt/archives/nginx_1.20.0-1~xenial_amd64.deb 7
/usr/bin/cloud-init 10
/usr/lib/python3/dist-packages/pkg_resources/__init__.py 13
/usr/lib/python3/dist-packages/cloudinit/cmd/main.py 6
/etc/ssh/sshd_config 30
/etc/security/limits.conf 6
/engage_mobile/rbl_user_batch/rbluserdatasync/rblsalslipsync.sh 7
/var/log/secure 12
6/15/2021 6
/shared/microfocus/cobol_40_pu10/lib 8
/var/opt/stp/obe/obe3f/ 10
/etc/cloud/cloud.cfg 18
/var/log/syslog 19
/etc/sysconfig/network-scripts/ifcfg-eth0 14
/boot/grub2/grub.cfg 24
/var/log/dmesg 10
2020/09/12 6
/mnt/etc/fstab 6
/var/empty/sshd 6
/var/log/amazon/ssm/amazon-ssm-agent.log 11
/proc/sys/kernel/hung_task_timeout_secs 7
/usr/lib/libdl.so.2 6
/opt/aws/amazon-cloudwatch-agent/bin/config.json

In [22]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

30116
29379


Note: there are words which can be of use and which are separated by just a single /

---

words which are like patterns 'word/words/words'

In [23]:
# Queue for removal every token where any character is followed by "/word/word".
# The pattern is a raw string: '\w' and '\/' inside a plain literal are invalid
# escape sequences that newer Python versions warn about.
words_to_remove = []
for w in words:
    if re.search(r'.\/\w+\/\w+', w):
        # All matches are echoed here, regardless of frequency.
        print(w, words[w])
        words_to_remove.append(w)

'/tsa_api/copy_fares_of_version2 2
./lazypay_analytics/lp_mix_panel.ibd 1
'/tmp/myfd=55 1
///home/containers.json 2
'/home/containers.json 2
'/home/ec2-user/ 1
'/dev/disk 1
:/home/testvm 1
:/home/pradeep.bodlapati 1
set-default=/boot/vmlinuz-3.10.0-957.10.1.el7.x86_64 1
//etc/yum.repo.d/bak 1
~/downloads/rr-ops-uswest.pem 1
all=/bin/bash 1
'/bin/su 1
'boot_image=/boot/vmlinuz-4.4.0-1117- 1
/tmp/spinnaker-0.repo 1
/etc/yum.repos.d/ 1
/etc/yum.repos.d/ 1
downloaddir=/root/rpms/ 2
'/dev/xvda 2
â/dev/hugepagesâ 1
â/dev/mqueueâ 1
â/dev/shmâ 1
disk=/dev/xvda 1
'/etc/sudoers 1
:/home/ubuntu 1
/tmp/growpart.28805 1
./aws/install 1
'/var/tmp/ 1
'/dev/sdf 2
'/grub2/i386-pc/increment.mod 1
//./pipe/docker_engine 1
'/etc/fstab 2
libdir=/usr/lib 1
'/dev/sdp 1
prefix=/opt/nginx 1
path=/home/ec2-user/.local/bin 1
dir=/mysqlbackup/binary_logs/ 2
set-default=/boot/vmlinuz-4.18.0-80.4.2.el8_1.x86_64 1
meta-data=/dev/nvme1n1 1
//bucket/backup/ 1
'/efc/fstab 1
'/boot/initramfs-3.10.0-11

In [24]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

29379
29331


---

words which are separated by at least 2 '-'

In [25]:
# Queue for removal every token with at least two '-' separating word characters.
# The pattern is a raw string: '\w' inside a plain literal is an invalid escape
# sequence that newer Python versions warn about.
words_to_remove = []
for w in words:
    if re.search(r'\w+-\w+-\w+', w):
        # Every match is removed; only the frequent ones are echoed to keep the output short.
        if words[w] > 5:
            print(w, words[w])
        words_to_remove.append(w)

us-west-2 26
us-east-1 111
eu-west-1 32
eu-west-2 7
ap-northeast-1 9
yyyy-mm-dd 549
ap-south-1 190
2021-02-19 7
region=us-east-1 10
modify-instance-metadata-options 6
ap-southeast-1 57
search-disabled-repos 8
yum-config-manager 14
2021-02-21 8
2021-03-16 11
us-east-1b 19
+91-961-968-2411 9
ip-10-102-113-249 17
eu-west-1a 6
ap-south-1c 51
ap-south-1a 77
ap-south-1b 44
us-east-1a 24
us-east-1c 8
us-east-1f 8
ip-10-0-0-221 8
global-db-master-1 19
aws-vsr-016 6
2021-05-30 33
apeu-pp04-wfe01 7
apeu-pp04-app01 7
apeu-sp03-sch01 7
apeu-pp04-wfe02 7
ip-172-18-101-203 6
gssapi-with-mic 9
get-caller-identity 12
modify-instance-attribute 20
aws-vsr-032 6
ssm-agent-worker 7
eu-central-1 24
eu-west-1c 8
2020-11-09 7
content-transfer-encoding 34
no-verify-ssl 13
ap-southeast-1a 21
decode-authorization-message 12
prod-wedz-gtm-br-mongo-2 6
describe-instance-information 8
themis-ci-slave-2682ef81-8dd6-43ab-ac0c-ce7efae1da73-m5x9-3qx89 7
ip-10-101-57-84.ap-southeast-1.compute.internal 6
94dd0e2b-03cc-1

In [26]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

29331
27545


Note: there are words which can be useful and are separated by a single -
    
---


words which are separated by at least 2 '.'

In [27]:
# Queue for removal every token with at least two '.' separating word characters.
# The pattern is a raw string: '\w' and '\.' inside a plain literal are invalid
# escape sequences that newer Python versions warn about.
words_to_remove = []
for w in words:
    if re.search(r'\w+\.\w+\.\w+', w):
        # Every match is removed; only the frequent ones are echoed to keep the output short.
        if words[w] > 5:
            print(w, words[w])
        words_to_remove.append(w)

3.7.5 6
3.10.0-514.16.1. 19
1.14.0-0+xenial1 6
1.16.1-0+xenial1 6
5.3.18-24.64-def 8
4.14.198-152.320 26
glibc-2.17-317.el7.i686 7
8.3.4 7
v1.3.2.53 6
_.errordetails.message 12
x.x.x.x 6
//aws.amazon.c 12
2.4.0 8
3.10.0-1160.25.1.el7.x86_64 6
liblber-2.4.so.2 6
3.10.0-1127.8.2. 6
:1.8.0_144 6


In [28]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

27545
26746


Note: there should be some words separated by just 1 '.'

---

words containing at least one digit (pattern "digits with words")

In [29]:
# Queue for removal every token that contains at least one digit.
# NOTE: this is a very broad filter; tokens worth keeping are taken back out of
# words_to_remove before the removal is applied.
# The pattern is a raw string: '\d' inside a plain literal is an invalid escape
# sequence that newer Python versions warn about.
words_to_remove = []
for w in words:
    if re.search(r'\d+', w):
        # Every match is removed; only the frequent ones are echoed to keep the output short.
        if words[w] > 5:
            print(f"{w}  (count={words[w]})")
        words_to_remove.append(w)

ec2  (count=3361)
1.  (count=581)
2.  (count=542)
3.  (count=387)
4.  (count=246)
5.  (count=145)
1/2  (count=173)
6.  (count=89)
3  (count=1033)
72  (count=1364)
2/2  (count=50)
403  (count=40)
7  (count=270)
80  (count=107)
2  (count=2140)
256  (count=9)
25  (count=107)
s3  (count=500)
10  (count=220)
16  (count=99)
2020  (count=154)
-1  (count=31)
502  (count=10)
tlsv1.2  (count=22)
0  (count=1033)
http/1.1  (count=27)
26th  (count=12)
2021  (count=197)
100  (count=202)
1  (count=3135)
linux2  (count=16)
5  (count=440)
6  (count=290)
300  (count=32)
60  (count=60)
48  (count=25)
22nd  (count=9)
m5a  (count=21)
r4  (count=33)
15  (count=110)
13th  (count=6)
zee5  (count=6)
t3a.xlarge  (count=12)
31st  (count=6)
r5a.4xlarge  (count=7)
+91  (count=119)
m4  (count=58)
c5  (count=58)
19  (count=34)
36  (count=16)
23  (count=48)
862423538905  (count=6)
1000  (count=36)
17  (count=44)
7.  (count=52)
8.  (count=41)
9.  (count=22)
5000  (count=10)
2017  (count=19)
t2  (count=81)
ec2-instance

In [30]:
# Take the EC2 tokens back out of the removal queue so they stay in the vocabulary.
# Filtering (instead of list.remove) does not raise when a token is absent,
# so the cell can safely be re-run.
keep_tokens = {'ec2', 'ec2-user', "'ec2"}
words_to_remove = [w for w in words_to_remove if w not in keep_tokens]

In [31]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

26746
17841


---

matching the pattern with  "//"

In [32]:
# Queue for removal every token containing a double slash, echoing each one with its count.
words_to_remove = [w for w in words if re.search('//', w)]
for w in words_to_remove:
    print(w, words[w])

// 113
//map.json 1
//console 3
//consol 3
//a 3
//www 1
//con 2
//aws.ama 5
//aws.amazo 5
//docs 4
//aw 5
//aws 6
//blog.e-z 1
//aws.amaz 6
//d 2
//mapping.json 1
//pg-documents/ 1
//ap-south 2
//aws.a 5
//*.windows 1
//blanktarget 1
//who.is/whois-ip/ip-address/ 1
//c 2
//do 2
transitional//en 1
unrestricted// 1
//cons 2
//doc 1
//./administrators 1
//./remote 1
//ad/ 1
//console.a 1


In [33]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

17841
17809


---

matching the pattern with  "\\\\"

In [34]:
# Queue for removal every token containing a backslash, echoing each one with its count.
words_to_remove = [w for w in words if re.search("\\\\", w)]
for w in words_to_remove:
    print(f"{w}  (count={words[w]})")

\  (count=222)
\programdata\amazon\ssm  (count=1)
\program  (count=48)
files\amazon\ssm  (count=1)
\searchengine  (count=3)
endlessaisle\image  (count=3)
engine\server  (count=2)
\t\  (count=3)
files\amazon\xentools\liteagent.exe  (count=1)
authority\system  (count=3)
upload\download  (count=1)
\users\pimappdev  (count=1)
\users\pimappdev.edelcap  (count=1)
disablerepo=\*  (count=1)
\n  (count=11)
\users\administrator  (count=1)
\users\manjunath.n  (count=1)
files\amazon\awscli\bin\  (count=1)
files\amazon\awscli\runtime\lib\site-packages\awscli\clidrive  (count=6)
files\amazon\awscli\runtime\lib\site-packages\botocore\sessio  (count=1)
files\amazon\awscli\runtime\lib\site-packages\botocore\hooks  (count=3)
files\amazon\awscli\runtime\lib\site-packages\awscli\customiz  (count=1)
ations\  (count=1)
___|\___|___|  (count=1)
exceeded\n\tstatus  (count=4)
\users\ilawsadmin  (count=4)
hkey_local_machine\software\microsoft\windows  (count=4)
nt\currentversion\softwareprotectionplatform\  (co

In [35]:
# for element in [
#     'engine\server', 'authority\system', 'upload\download', "ations\\", "programfiles\trend", 'micro\deep',
#     'denied\\', 'microsoft\windows', 'page\\', 'of\\', 'aws\\', 'internet\network', 'panel\network', 'files\apache',
#     'foundation\tomcat', 'manager\memory', 'chat\call', '\\nreading', '\\nresolving', '\\n\\nproblem', 'provided\\n ',
#     'files\amazon'
# ]:
#     try:
#         words_to_remove.remove(element)
#     except:
#         print(element)

In [36]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

17809
17559


---
pattern with ./

In [37]:
# Queue for removal every token containing "./".
# The pattern is a raw string: '\.' inside a plain literal is an invalid escape
# sequence that newer Python versions warn about.
words_to_remove = []
for w in words:
    if re.search(r'\./', w):
        # All matches are echoed here, regardless of frequency.
        print(f"{w}  (count={words[w]})")
        words_to_remove.append(w)

./install.sh  (count=2)
./p.pem  (count=1)
./nitro_check_script.sh  (count=1)
./configure  (count=7)
./*.rpm  (count=1)
./setup.exe  (count=1)
./install  (count=2)
./awscli-bundle/install  (count=1)
./installer_linux.py  (count=1)
./dockercli.exe  (count=1)


In [38]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

17559
17549


---
pattern with pattern "/words/"

In [39]:
# Queue for removal every token containing "/word/".
# The pattern is a raw string: '\w' inside a plain literal is an invalid escape
# sequence that newer Python versions warn about.
words_to_remove = []
for w in words:
    if re.search(r'/\w+/', w):
        # All matches are echoed here, regardless of frequency.
        print(f"{w}  (count={words[w]})")
        words_to_remove.append(w)

/users/shhjs/.ssh/config  (count=1)
/home/jenkins  (count=3)
/var/log/  (count=15)
/etc/*release  (count=3)
/var/log  (count=36)
/var/tmp  (count=3)
/dev/shm  (count=8)
/usr/tmpdisk  (count=1)
/bin/bash  (count=32)
/proc/cpuinfo  (count=7)
/dists/bionic-updates/inrelease  (count=1)
/dists/bionic-backports/inrelease  (count=1)
/dists/bionic-security/inrelease  (count=1)
/bin/yum  (count=2)
/latest/user-data  (count=4)
/home/dansaha/.ssh/config  (count=2)
/hana/shared  (count=5)
/opt/adobe  (count=1)
/opt/adobe-new  (count=1)
/usr/local  (count=2)
/dev/xvda  (count=30)
/etc/fstab  (count=157)
/etc/os-release  (count=4)
/sbin/plymouthd  (count=1)
/bin/rm  (count=1)
/home/ubuntu  (count=8)
/home/ubuntu/.ssh  (count=3)
/home/ubuntu/.ssh/authorized_keys  (count=4)
/tmp/agentdependencies  (count=4)
/hana/log  (count=1)
/hana/data  (count=4)
/usr/sap  (count=1)
/etc/resolv.conf  (count=21)
/dev/sdz  (count=2)
/dev/xvdj  (count=3)
/proc/filesystems  (count=1)
/dev/sd*  (count=1)
/etc/rsyslog.d/

In [40]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

17549
17318


---
pattern with pattern "_words"

In [41]:
# Queue for removal every token with an underscore followed by lower-case letters,
# echoing each one with its count.
words_to_remove = [w for w in words if re.search('_[a-z]+', w)]
for w in words_to_remove:
    print(f"{w}  (count={words[w]})")

statuscheckfailed_system  (count=62)
nitro_check  (count=1)
dbschema_tables_info  (count=2)
'pageiolatch_ex  (count=1)
id_like=  (count=5)
version_id=  (count=7)
pretty_name=  (count=7)
ansi_color=  (count=6)
cpe_name=  (count=6)
home_url=  (count=6)
kb_read/s  (count=1)
kb_wrtn/s  (count=1)
kb_read  (count=1)
kb_wrtn  (count=1)
profile_name='stag-mfa  (count=1)
describe_output=  (count=1)
user_xattr  (count=1)
.skip_if_unavailable=true  (count=1)
x_get_es_cache_info  (count=1)
x_check_health  (count=1)
x_check_disk  (count=1)
m_events  (count=1)
constant_tsc  (count=9)
nonstop_tsc  (count=10)
current_clocksource  (count=4)
available_clocksource  (count=4)
unioniq_services_ingenico_agreement_final_compressed.pdf  (count=1)
unioniq_services_ingenico_agreement_final_compressed.json  (count=1)
iepl_pg_me_kit_nifara  (count=1)
iepl_pg_me_kit_nifara.json  (count=1)
iepl_pg_me_kit_riya_cabs_trip_private_limited  (count=1)
iepl_pg_me_kit_riya_cabs_trip_private_limited.json  (count=1)
completi

In [42]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

17318
16595


---
words of length 2 or less

In [43]:
# Queue for removal every token shorter than three characters, echoing each one with its count.
words_to_remove = [w for w in words if len(w) < 3]
for w in words_to_remove:
    print(f"{w}  (count={words[w]})")

,  (count=52022)
i  (count=16729)
am  (count=1685)
in  (count=12328)
of  (count=14742)
(  (count=8324)
)  (count=8873)
it  (count=7142)
is  (count=13333)
up  (count=1133)
.  (count=62331)
at  (count=5671)
pm  (count=328)
to  (count=48397)
:  (count=32061)
``  (count=6474)
''  (count=7080)
id  (count=3200)
s  (count=3050)
we  (count=15738)
a  (count=14413)
us  (count=6159)
no  (count=3030)
on  (count=11897)
if  (count=9187)
be  (count=9407)
do  (count=3413)
so  (count=2900)
as  (count=9622)
!  (count=1734)
hi  (count=3041)
r  (count=985)
an  (count=5604)
me  (count=2117)
my  (count=2486)
&  (count=562)
by  (count=7492)
'm  (count=440)
or  (count=6009)
-  (count=5221)
{  (count=715)
ip  (count=1240)
s.  (count=375)
'  (count=2308)
az  (count=158)
?  (count=3752)
c  (count=1139)
>  (count=2002)
's  (count=961)
v.  (count=84)
%  (count=731)
;  (count=917)
'd  (count=100)
ad  (count=69)
hh  (count=548)
mm  (count=548)
ss  (count=552)
vm  (count=54)
/  (count=400)
db  (count=133)
[  (count=7

In [44]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

16595
16128


In [45]:
# Queue for removal every token shorter than four characters that the en_US
# dictionary rejects, echoing each one with its count.
language_dict = enchant.Dict("en_US")
words_to_remove = [w for w in words if len(w) < 4 and not language_dict.check(w)]
for w in words_to_remove:
    print(f"{w}  (count={words[w]})")

ec2  (count=3361)
ist  (count=398)
ami  (count=1756)
ebs  (count=885)
kms  (count=129)
cmk  (count=53)
aws  (count=22912)
n't  (count=3638)
url  (count=3695)
rca  (count=54)
api  (count=573)
anu  (count=55)
sdk  (count=42)
asg  (count=191)
ips  (count=151)
cli  (count=389)
iam  (count=397)
cpu  (count=874)
arn  (count=214)
acm  (count=1)
sla  (count=15)
nat  (count=128)
vpc  (count=458)
eip  (count=107)
utc  (count=1372)
kie  (count=2)
sr.  (count=2)
gmt  (count=46)
emr  (count=33)
ott  (count=1)
raj  (count=3)
dec  (count=103)
plz  (count=8)
ena  (count=437)
n/w  (count=4)
ssm  (count=604)
tcp  (count=214)
sql  (count=363)
rtm  (count=2)
sai  (count=60)
iad  (count=2)
cwe  (count=2)
nic  (count=43)
dns  (count=248)
efs  (count=48)
.if  (count=1)
rds  (count=128)
vpn  (count=100)
've  (count=335)
npm  (count=26)
ssl  (count=93)
smp  (count=15)
mon  (count=19)
aug  (count=28)
id=  (count=6)
oct  (count=95)
adr  (count=1)
sar  (count=57)
tps  (count=3)
ecr  (count=19)
cmd  (count=31)
pls

In [46]:
# Abbreviations the dictionary rejects but which should stay in the vocabulary:
# take each one back out of the removal queue.
for element in [
    'ec2', 'ami', 'ebs', 'kms', 'aws', 'cmk', 'url', 'web', 'oem', 'asn', 'gps', 'jre', 'isn', 'jmx', 'gdb', 'eol', 'gpt', 'mbr',
    'isp', 'ide', 'phd', 'faq', 'gcc', 'vnc', 'udp', 'ntp', 'nfs', 'gpu', 'pid', 'lvm', 'gui', 'ecs', 'pem', 'awk', 'ecs', 'dms',
    'msg', 'pfa', 'rdp', 'pci', 'sep', 'fri', 'tue', 'feb', 'thu', 'gpl', 'sns', 'sme', 'acl', 'cmd', 'ecr', 'aug', 'mon', 'vpn',
    'ssl', 'npm', 'vpn', 'rds', 'efs', 'dns', 'tcp', 'sql', 'ena', 'dec', 'emr', 'gmt', 'vpc', 'nat', 'acm', 'arn', 'cli', 'sdk',
    'api'
]:
    try:
        words_to_remove.remove(element)
    except ValueError:
        # list.remove raises ValueError when the token is not queued (never
        # flagged, or listed twice above); report it and carry on. Any other
        # error is no longer swallowed.
        print(element)


web
ecs
vpn


In [47]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

16128
15504


---
words starting with : or - or has = in them

In [48]:
# Queue for removal every token that starts with ':' or '-' followed by a word
# character, or that contains "word=word".
# The pattern is a raw string: '\w' inside a plain literal is an invalid escape
# sequence that newer Python versions warn about.
words_to_remove = []
for w in words:
    if re.search(r'^:\w+|^-\w+|\w+=\w+', w):
        # All matches are echoed here, regardless of frequency.
        print(f"{w}  (count={words[w]})")
        words_to_remove.append(w)

-ano  (count=5)
circle=all  (count=2)
operator=all  (count=2)
mode=app  (count=2)
:describeinstances  (count=14)
key=team  (count=1)
value=hmsexp  (count=1)
-version  (count=6)
label=cloudimg-rootfs  (count=2)
errors=remount-ro  (count=1)
-euo  (count=1)
-like  (count=3)
baseurl=file  (count=1)
gpgkey=file  (count=5)
enablerepo=offline-httpd  (count=1)
-gt  (count=5)
-instance  (count=2)
:instance  (count=2)
-computername  (count=10)
-port  (count=25)
-if  (count=4)
name=instanceid  (count=1)
-vvv  (count=11)
-testing  (count=5)
:describeavailabilityzones  (count=1)
-arn  (count=3)
tclass=capability  (count=1)
success=yes  (count=1)
-ll  (count=1)
:natgateway  (count=2)
:cloudformation  (count=1)
:stack  (count=1)
mailer=esmtp  (count=1)
stat=deferred  (count=1)
setopt=docke  (count=1)
name=docker  (count=9)
-way  (count=1)
-linux  (count=1)
-keygen  (count=1)
-https  (count=2)
key=name  (count=4)
key=aws  (count=2)
-ling  (count=1)
-adminaddress  (count=7)
-bearertoken  (count=6)
-sou

In [49]:
# Drop the tokens currently queued in words_to_remove; print the vocabulary size before and after.
# pop(w, None) keeps the cell re-runnable: a second run is a no-op instead of raising KeyError.
print(len(words))
for w in words_to_remove:
    words.pop(w, None)
print(len(words))

15504
15074


---

In [None]:
# Tokens that occur exactly once: show how many there are, then list them.
_temp = [token for token, count in words.items() if count == 1]
print(len(_temp))
print(_temp)

---

# Standard Cleaning

---
punctuations and unicode cleaning

In [53]:
punct_dict = {key: ' ' for key in string.punctuation}
# Built once instead of on every call: maps each punctuation character to a space.
_PUNCT_TABLE = str.maketrans(punct_dict)
# Any character outside the 7-bit ASCII range.
_NON_ASCII = re.compile(r'[^\x00-\x7f]')


def remove_punctuation(sent):
    """Reduce ``sent`` to its vocabulary tokens, without punctuation or non-ASCII characters.

    Steps: lower-case and flatten newlines/tabs, tokenise into sentences and
    words, keep only tokens present in the module-level ``words`` vocabulary,
    replace punctuation with spaces, strip non-ASCII characters and collapse
    whitespace.

    Args:
        sent: raw text.

    Returns:
        The cleaned text with tokens separated by single spaces.
    """
    sent = sent.lower().replace('\n', ' ').replace('\t', ' ')
    kept = []
    for s_token in sent_tokenize(sent):
        # Membership test directly on the dict (no need for .keys()).
        kept.extend(_t for _t in word_tokenize(s_token) if _t in words)
    text = ' '.join(kept).translate(_PUNCT_TABLE)
    # Strip non-ASCII characters per token and drop tokens that end up empty,
    # so a token made only of such characters cannot leave a double space behind.
    stripped = (_NON_ASCII.sub('', piece) for piece in text.split())
    return ' '.join(piece for piece in stripped if piece)

In [54]:
# Clean every URL-stripped message body and show the first five results.
data['2_punctuation_removal'] = data['1_url_cleaned_comm_body'].apply(remove_punctuation)
data['2_punctuation_removal'].head(5).values

array(['dear team facing issue one the ec2 not coming kindly required your help please join meeting action taken launch ec2 take ami use this ami launch ec2 change ebs kms from aws ebs cmk ec2 launched status checks comes trial times purpose want migrate into another aws account and this ec2 contain aws default kms thanks nilay talati instance',
       'hello have heard back from you regarding case for while for continued support regarding the same issue you can contact any time using the following url please note that action required your part you wish this case resolved however you want close this case yourself you could via the support center have heard back from you within the next hours will mark the case resolved although you can re open the case any time using the link above best regards amazon web services',
       'while checking the instance the above instance status check would like know the cause that the instance was status check that can take more preventive measures inst

In [55]:
# Show the same five rows before cleaning, for comparison (head() defaults to 5 rows).
data['1_url_cleaned_comm_body'].head().values

array(['Dear Team,  I am facing issue in one of the EC2 (i-0e797d5b0d54cc3f4) it is not coming up. Kindly required your help. please join meeting at 12:05 PM IST.    Action I taken to launch EC2 (i-0e797d5b0d54cc3f4).: 1. Take AMI of  : i-0afdd40d680d5ed41 2. use this AMI to launch EC2: ami-0d261c2dff85df4d9 3. Change EBS KMS from "aws/ebs" to "cmk: 76dc5c61-d789-4f80-ac0b-21db46de0e47" 4. EC2 launched: i-0e797d5b0d54cc3f4 5. Status : 1/2 Checks comes 6.  Trial : 3 times  Purpose: I want to migrate i-0afdd40d680d5ed41 into another AWS account. and this EC2 contain aws default kms.  Thanks, Nilay Talati +919429142780 Instance ID(s):',
       "Hello,  We haven't heard back from you regarding case 7938887601 for a while. For continued support regarding the same issue, you can contact us any time using the following URL:    Please note that no action is required on your part if you wish this case to be resolved. However, if you want to close this case yourself, you could do so via the Supp

In [56]:
# Rebuild the token counts from the distinct punctuation-cleaned messages.
# Note that this rebinds the global `words` that remove_punctuation() reads.
sentences = data['2_punctuation_removal'].unique()
words = {}
for sent in sentences:
    sent = sent.lower().replace('\n', ' ').replace('\t', ' ')
    for s_token in sent_tokenize(sent):
        for t in word_tokenize(s_token):
            words[t] = words.get(t, 0) + 1

# Vocabulary size, then the number and the list of tokens seen exactly once.
print(len(words))
_temp = [token for token, count in words.items() if count == 1]
print(len(_temp))
print(_temp)

11888
3860


---
lemmatization

In [57]:
lemmatizer = WordNetLemmatizer()
def lemmatization(x):
    """Lower-case ``x``, split it on whitespace, pass each word through
    ``lemmatizer.lemmatize`` and return the results joined by single spaces.

    NOTE(review): the sample output of this step shows 'was' -> 'wa',
    'ebs' -> 'eb' and 'kms' -> 'km'; presumably this is because lemmatize()
    is called without a part-of-speech argument -- confirm this is acceptable.
    """
    return ' '.join(lemmatizer.lemmatize(w) for w in x.lower().split())

# Lemmatize every cleaned message and show the first five results.
# The column name (spelling included) is referenced by later cells.
data['3_post_lemmetization'] = data['2_punctuation_removal'].apply(lemmatization)
data['3_post_lemmetization'].head(5).values

array(['dear team facing issue one the ec2 not coming kindly required your help please join meeting action taken launch ec2 take ami use this ami launch ec2 change eb km from aws eb cmk ec2 launched status check come trial time purpose want migrate into another aws account and this ec2 contain aws default km thanks nilay talati instance',
       'hello have heard back from you regarding case for while for continued support regarding the same issue you can contact any time using the following url please note that action required your part you wish this case resolved however you want close this case yourself you could via the support center have heard back from you within the next hour will mark the case resolved although you can re open the case any time using the link above best regard amazon web service',
       'while checking the instance the above instance status check would like know the cause that the instance wa status check that can take more preventive measure instance',
     

In [58]:
# Recount tokens over the distinct lemmatized messages to see how much the
# vocabulary shrank.
sentences = data['3_post_lemmetization'].unique()
words = {}
for sent in sentences:
    sent = sent.lower().replace('\n', ' ').replace('\t', ' ')
    for s_token in sent_tokenize(sent):
        for t in word_tokenize(s_token):
            words[t] = words.get(t, 0) + 1

# Vocabulary size, then the number and the list of tokens seen exactly once.
print(len(words))
_temp = [token for token, count in words.items() if count == 1]
print(len(_temp))
print(_temp)

11039
3638


---
stop word removal

In [59]:
# NLTK's English stop-word list, held as a set for fast membership checks.
stop_words = set(stopwords.words('english'))
# NOTE(review): this runs on already-lemmatized text, and the sample output
# still contains 'wa' (apparently from 'was') -- confirm whether stop words
# should be removed before lemmatization instead.
data['4_stop_words_removed'] = data['3_post_lemmetization'].apply(
    lambda sent: ' '.join(w for w in sent.split() if w.lower() not in stop_words)
)
data['4_stop_words_removed'].head(5).values

array(['dear team facing issue one ec2 coming kindly required help please join meeting action taken launch ec2 take ami use ami launch ec2 change eb km aws eb cmk ec2 launched status check come trial time purpose want migrate another aws account ec2 contain aws default km thanks nilay talati instance',
       'hello heard back regarding case continued support regarding issue contact time using following url please note action required part wish case resolved however want close case could via support center heard back within next hour mark case resolved although open case time using link best regard amazon web service',
       'checking instance instance status check would like know cause instance wa status check take preventive measure instance',
       'hello looked issue taking action towards think action requires end surely close case well known issue thank',
       'please let know helped resolve issue yes click click'],
      dtype=object)

In [60]:
# Final token counts over the distinct stop-word-filtered messages.
sentences = data['4_stop_words_removed'].unique()
words = {}
for sent in sentences:
    sent = sent.lower().replace('\n', ' ').replace('\t', ' ')
    for s_token in sent_tokenize(sent):
        for t in word_tokenize(s_token):
            words[t] = words.get(t, 0) + 1

# Vocabulary size, then the number and the list of tokens seen exactly once.
print(len(words))
_temp = [token for token, count in words.items() if count == 1]
print(len(_temp))
print(_temp)

10911
3630


In [61]:
# Preview the first rows of `data`, including the intermediate text columns.
data.head()

Unnamed: 0,case_id,customer_name,service,case_billing_region,customer_billing_country_name,comm_owner_agent_login,comm_body,case_creation_cal_date,comm_date_utc,comm_subject,case_severity,urls,1_url_cleaned_comm_body,2_punctuation_removal,3_post_lemmetization,4_stop_words_removed
0,8415574821,Genpact,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,gupmanav,"Dear Team, I am facing issue in one of the EC...",6/2/2021 0:00,6/2/2021 0:00,1/2 checks EC2,4,https://genpact.zoom.us/my/talati,"Dear Team, I am facing issue in one of the EC...",dear team facing issue one the ec2 not coming ...,dear team facing issue one the ec2 not coming ...,dear team facing issue one ec2 coming kindly r...
1,7938887601,Axiata Digital Services Sdn Bhd,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,arizona,"Hello, We haven't heard back from you regardi...",2/1/2021 0:00,2/9/2021 0:00,1/2 status check,4,https://console.aws.amazon.com/support/home?#/...,"Hello, We haven't heard back from you regardi...",hello have heard back from you regarding case ...,hello have heard back from you regarding case ...,hello heard back regarding case continued supp...
2,7938887601,Axiata Digital Services Sdn Bhd,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,wadhwh,while checking the instance the above instance...,2/1/2021 0:00,2/1/2021 0:00,1/2 status check,4,,while checking the instance the above instance...,while checking the instance the above instance...,while checking the instance the above instance...,checking instance instance status check would ...
3,7938887601,Axiata Digital Services Sdn Bhd,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,wadhwh,"Hello, We have looked into the issue and we ...",2/1/2021 0:00,2/9/2021 0:00,1/2 status check,4,,"Hello, We have looked into the issue and we ...",hello have looked into the issue and are takin...,hello have looked into the issue and are takin...,hello looked issue taking action towards think...
4,7745171561,Tata Communications Ltd.,Elastic Compute Cloud (EC2 - Linux),APAC,INDIA,arizona,Please let us know if we helped resolve your i...,12/16/2020 0:00,12/26/2020 0:00,2/2 check failed,3,https://console.aws.amazon.com/support/feedbac...,Please let us know if we helped resolve your i...,please let know helped resolve your issue yes ...,please let know helped resolve your issue yes ...,please let know helped resolve issue yes click...


In [63]:
# List the column labels currently present on `data`.
data.columns

Index(['case_id', 'customer_name', 'service', 'case_billing_region',
       'customer_billing_country_name', 'comm_owner_agent_login', 'comm_body',
       'case_creation_cal_date', 'comm_date_utc', 'comm_subject',
       'case_severity', 'urls', '1_url_cleaned_comm_body',
       '2_punctuation_removal', '3_post_lemmetization',
       '4_stop_words_removed'],
      dtype='object')

In [64]:
# Persist the case metadata together with the final cleaned text column only;
# the intermediate 1_/2_/3_ columns are left out of the export.
# os.path.join is used because base_dir already ends with '/', so the previous
# f-string form produced a doubled separator in the path.
data[
        [
            'case_id', 'customer_name', 'service', 'case_billing_region',
            'customer_billing_country_name', 'comm_owner_agent_login', 'comm_body',
            'case_creation_cal_date', 'comm_date_utc', 'comm_subject',
            'case_severity', 'urls', '4_stop_words_removed'
        ]
].to_pickle(os.path.join(base_dir, 'Data', 'topic_modeling_data.pkl'))

In [None]:
# t = data['4_stop_words_removed'].unique()
# words = set()
# for v in t:
#     words.update(v.split())

# print(words)

# words = []
# for row in data.iterrows():
#     print(row['4_stop_words_removed'])


In [None]:
# language_dict = enchant.Dict("en_US")
# def is_not_blank(s):
#     return bool(s and not s.isspace())

# words_with_problem = []
# for k, v in words.items():
#     try:
# #             _word = contractions.fix(k.strip())
#         _word = k.strip()
#         if not language_dict.check(_word):
#             print(f"{k}: {v}")
#     except:
#         words_with_problem.append(k)

# words_with_problem

# contractions.fix(words_with_problem[0])

In [None]:
# sentence = """
# Hello We have n't heard back from you regarding case 7938887601 for a while For continued support regarding the same issue you can contact us any time using the following URL Please note that no action is required on your part if you wish this case to be resolved However if you want to close this case yourself you could do so via the Support Center If we have n't heard back from you within the next 72 hours we will mark the case as Resolved although you can re-open the case any time using the link above Best Regards Amazon Web Services
# """

# from pprint import pprint 

# nlp = spacy.load('en_core_web_sm')
# doc = nlp(sentence)
# pprint(" ".join([token.lemma_ for token in doc]))

# ps = PorterStemmer() 
# lemmatizer = WordNetLemmatizer() 
# for i in sentence.split():
#     print(f"{i}: Lemma={lemmatizer.lemmatize(i)}, Stem={ps.stem(i)}", ) 