# AWS Assignment - NLP & Clustering | Nishthavan Dahiya 

## 1. Read all files in the S3 bucket 

In [2]:
import boto3
# Notebook-wide S3 resource handle; later cells use it to open the bucket and to fetch objects.
# NOTE(review): no credentials are passed here, so boto3 presumably picks them up from the environment — confirm.
S3 = boto3.resource("s3")

In [3]:
# Read every input document stored in the S3 bucket as UTF-8 text.
# Step 6 of this notebook uploads its output to "results/Results.txt" in this
# same bucket, so keys under "results/" are skipped; otherwise a re-run would
# ingest the previous run's output as if it were an input document.
RESULTS_PREFIX = "results/"
s3_bucket = S3.Bucket("awsassignmentbucket7")
list_data = [
    bucket_obj.get()['Body'].read().decode("utf-8")
    for bucket_obj in s3_bucket.objects.all()
    if not bucket_obj.key.startswith(RESULTS_PREFIX)
]
list_data

['The electric vehicle (EV) revolution is speeding up, but it can only go so far without the necessary infrastructure and technology. As thinking shifts from fossil fuels to all-electric, visions of a brighter, more optimistic world come into view. The UK government’s pledge to ban the sale of all new non-electric cars, including gasoline, diesel and hybrid vehicles from 2035, highlights the drive to end the nation’s contribution to Climate Change by 2050. If the 2035 target is to be met, we will all see evolutions in the transport and mobility routines that keep our lives moving. From using ultra-fast wireless charging to supporting the developing world by repurposing car batteries, WMG, at the University of Warwick, is delivering advances in electrification knowledge and technologies, which will enable the leap to an electric automotive future. So, for the now and the near future, what do we need to consider?',
 'Demand for EVs is surging in the UK and registrations of plug-in cars i

## 2. Combine text from all files

In [4]:
# Merge all documents into one string, separating consecutive documents with a single space.
separator = " "
str_data = separator.join(list_data)
str_data

'The electric vehicle (EV) revolution is speeding up, but it can only go so far without the necessary infrastructure and technology. As thinking shifts from fossil fuels to all-electric, visions of a brighter, more optimistic world come into view. The UK government’s pledge to ban the sale of all new non-electric cars, including gasoline, diesel and hybrid vehicles from 2035, highlights the drive to end the nation’s contribution to Climate Change by 2050. If the 2035 target is to be met, we will all see evolutions in the transport and mobility routines that keep our lives moving. From using ultra-fast wireless charging to supporting the developing world by repurposing car batteries, WMG, at the University of Warwick, is delivering advances in electrification knowledge and technologies, which will enable the leap to an electric automotive future. So, for the now and the near future, what do we need to consider? Demand for EVs is surging in the UK and registrations of plug-in cars increa

## 3. Parse text as sentences

In [5]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
     |████████████████████████████████| 12.8 MB 6.8 MB/s            
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
import pandas as pd
import numpy as np
import spacy
import string

In [7]:
# Load the small English pipeline and run it over the combined text.
NLP = spacy.load('en_core_web_sm')
data = NLP(str_data)
# Collect the raw text of every sentence spaCy found in the document.
Sentences = [sent.text for sent in data.sents]
Sentences

['The electric vehicle (EV) revolution is speeding up, but it can only go so far without the necessary infrastructure and technology.',
 'As thinking shifts from fossil fuels to all-electric, visions of a brighter, more optimistic world come into view.',
 'The UK government’s pledge to ban the sale of all new non-electric cars, including gasoline, diesel and hybrid vehicles from 2035, highlights the drive to end the nation’s contribution to Climate Change by 2050.',
 'If the 2035 target is to be met, we will all see evolutions in the transport and mobility routines that keep our lives moving.',
 'From using ultra-fast wireless charging to supporting the developing world by repurposing car batteries, WMG, at the University of Warwick, is delivering advances in electrification knowledge and technologies, which will enable the leap to an electric automotive future.',
 'So, for the now and the near future, what do we need to consider?',
 'Demand for EVs is surging in the UK and registratio

## 4. Parse sentences into words and remove stop words

In [8]:
import nltk
nltk.download("punkt")
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Build the combined stop-word set once, up front: calling stopwords.words()
# per token rebuilds the whole NLTK list every time and then scans it linearly.
# stopwords.words() with no language argument returns the words for every
# language NLTK ships; these are merged with spaCy's default stop words.
STOP_WORDS = set(stopwords.words()) | set(NLP.Defaults.stop_words)

# NOTE(review): matching is case-sensitive, so capitalised stop words such as
# 'The' or 'As' are kept. Lower-casing before the lookup would drop them but
# could also drop acronyms (e.g. 'IT', 'US'); confirm the intended behaviour.
word_tokens = [
    word
    for sentence in Sentences
    for word in word_tokenize(sentence)
    if word not in STOP_WORDS
]
print(word_tokens)

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['The', 'electric', 'vehicle', '(', 'EV', ')', 'revolution', 'speeding', ',', 'far', 'necessary', 'infrastructure', 'technology', '.', 'As', 'thinking', 'shifts', 'fossil', 'fuels', 'all-electric', ',', 'visions', 'brighter', ',', 'optimistic', 'world', 'view', '.', 'The', 'UK', 'government', '’', 'pledge', 'ban', 'new', 'non-electric', 'cars', ',', 'including', 'gasoline', ',', 'diesel', 'hybrid', 'vehicles', '2035', ',', 'highlights', 'drive', 'nation', '’', 'contribution', 'Climate', 'Change', '2050', '.', 'If', '2035', 'target', ',', 'evolutions', 'transport', 'mobility', 'routines', 'lives', 'moving', '.', 'From', 'ultra-fast', 'wireless', 'charging', 'supporting', 'developing', 'world', 'repurposing', 'car', 'batteries', ',', 'WMG', ',', 'University', 'Warwick', ',', 'delivering', 'advances', 'electrification', 'knowledge', 'technologies', ',', 'enable', 'leap', 'electric', 'automotive', 'future', '.', 'So', ',', 'near', 'future', ',', 'need', 'consider', '?', 'Demand', 'EVs', 's

In [9]:
# Extra symbols to strip in addition to the ASCII punctuation in string.punctuation.
custom_punc = [',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›','♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾','═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲','è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
    '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø','¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']

# Every character that must be deleted from a token, gathered into one lookup set.
chars_to_strip = set(string.punctuation) | set(custom_punc)


def _strip_punctuation(token):
    """Return `token` with every character found in `chars_to_strip` removed."""
    return "".join(ch for ch in token if ch not in chars_to_strip)


# Clean each token, then discard the ones that became empty (pure punctuation).
word_tokens = [cleaned for cleaned in map(_strip_punctuation, word_tokens) if cleaned]
print(word_tokens)

['The', 'electric', 'vehicle', 'EV', 'revolution', 'speeding', 'far', 'necessary', 'infrastructure', 'technology', 'As', 'thinking', 'shifts', 'fossil', 'fuels', 'allelectric', 'visions', 'brighter', 'optimistic', 'world', 'view', 'The', 'UK', 'government', 'pledge', 'ban', 'new', 'nonelectric', 'cars', 'including', 'gasoline', 'diesel', 'hybrid', 'vehicles', '2035', 'highlights', 'drive', 'nation', 'contribution', 'Climate', 'Change', '2050', 'If', '2035', 'target', 'evolutions', 'transport', 'mobility', 'routines', 'lives', 'moving', 'From', 'ultrafast', 'wireless', 'charging', 'supporting', 'developing', 'world', 'repurposing', 'car', 'batteries', 'WMG', 'University', 'Warwick', 'delivering', 'advances', 'electrification', 'knowledge', 'technologies', 'enable', 'leap', 'electric', 'automotive', 'future', 'So', 'near', 'future', 'need', 'consider', 'Demand', 'EVs', 'surging', 'UK', 'registrations', 'plugin', 'cars', 'increased', '160000', '2013', '2018', 'With', 'electrification', 'i

## 5. Stemming of words

In [10]:
# Import only the name that is used; a wildcard import hides where names come
# from and can silently shadow variables already defined in the notebook.
from nltk.stem.porter import PorterStemmer

# Reduce every cleaned token to its Porter stem.
stemmer = PorterStemmer()
final_tokens = [stemmer.stem(token) for token in word_tokens]
print(final_tokens)

['the', 'electr', 'vehicl', 'EV', 'revolut', 'speed', 'far', 'necessari', 'infrastructur', 'technolog', 'As', 'think', 'shift', 'fossil', 'fuel', 'allelectr', 'vision', 'brighter', 'optimist', 'world', 'view', 'the', 'UK', 'govern', 'pledg', 'ban', 'new', 'nonelectr', 'car', 'includ', 'gasolin', 'diesel', 'hybrid', 'vehicl', '2035', 'highlight', 'drive', 'nation', 'contribut', 'climat', 'chang', '2050', 'If', '2035', 'target', 'evolut', 'transport', 'mobil', 'routin', 'live', 'move', 'from', 'ultrafast', 'wireless', 'charg', 'support', 'develop', 'world', 'repurpos', 'car', 'batteri', 'wmg', 'univers', 'warwick', 'deliv', 'advanc', 'electrif', 'knowledg', 'technolog', 'enabl', 'leap', 'electr', 'automot', 'futur', 'So', 'near', 'futur', 'need', 'consid', 'demand', 'ev', 'surg', 'UK', 'registr', 'plugin', 'car', 'increas', '160000', '2013', '2018', 'with', 'electrif', 'industri', 'estim', 'worth', '6bn', '2025', 'decad', 'present', 'massiv', 'opportun', 'howev', 'ev', 'remain', 'outskir

In [23]:
# Persist the stemmed tokens as one space-separated line of text.
# Mode "w" (not "a") so re-running the cell replaces the file instead of
# appending a second copy, and the `with` block closes (and therefore flushes)
# the file before a later cell uploads it. UTF-8 matches the decoding used when
# the file is read back from S3.
Results = " ".join(final_tokens)
with open("Results.txt", "w", encoding="utf-8") as f:
    chars_written = f.write(Results)
chars_written

1498

## 6.  Store results in S3 bucket

In [25]:
# Push the local Results.txt into the bucket, stored under the "results/" folder
# with the object name Results.txt.
S3 = boto3.resource("s3")
s3_bucket = S3.Bucket("awsassignmentbucket7")
s3_bucket.upload_file(Filename="Results.txt", Key="results/Results.txt")

## 7. Read the stored results back from the S3 bucket

In [26]:
# Fetch the results object from the bucket and decode its body as UTF-8 text.
obj = S3.Object("awsassignmentbucket7", "results/Results.txt")
response_body = obj.get()['Body']
final_data = response_body.read().decode("utf-8")
final_data

'the electr vehicl EV revolut speed far necessari infrastructur technolog As think shift fossil fuel allelectr vision brighter optimist world view the UK govern pledg ban new nonelectr car includ gasolin diesel hybrid vehicl 2035 highlight drive nation contribut climat chang 2050 If 2035 target evolut transport mobil routin live move from ultrafast wireless charg support develop world repurpos car batteri wmg univers warwick deliv advanc electrif knowledg technolog enabl leap electr automot futur So near futur need consid demand ev surg UK registr plugin car increas 160000 2013 2018 with electrif industri estim worth 6bn 2025 decad present massiv opportun howev ev remain outskirt mainstream consum offer match model usabl conveni afford convent vehicl offer today accord professor david greenwood He drive forward 2m innov ukfund multi optim solut energi storag system mosess project consortium led mclaren automot includ project partner a123 system reduc size weight emiss current ev the vi

In [27]:
# The results were saved as tokens joined by a single space, so splitting on
# that same separator recovers the token list.
TOKENS = final_data.split(sep=" ")
print(TOKENS)

['the', 'electr', 'vehicl', 'EV', 'revolut', 'speed', 'far', 'necessari', 'infrastructur', 'technolog', 'As', 'think', 'shift', 'fossil', 'fuel', 'allelectr', 'vision', 'brighter', 'optimist', 'world', 'view', 'the', 'UK', 'govern', 'pledg', 'ban', 'new', 'nonelectr', 'car', 'includ', 'gasolin', 'diesel', 'hybrid', 'vehicl', '2035', 'highlight', 'drive', 'nation', 'contribut', 'climat', 'chang', '2050', 'If', '2035', 'target', 'evolut', 'transport', 'mobil', 'routin', 'live', 'move', 'from', 'ultrafast', 'wireless', 'charg', 'support', 'develop', 'world', 'repurpos', 'car', 'batteri', 'wmg', 'univers', 'warwick', 'deliv', 'advanc', 'electrif', 'knowledg', 'technolog', 'enabl', 'leap', 'electr', 'automot', 'futur', 'So', 'near', 'futur', 'need', 'consid', 'demand', 'ev', 'surg', 'UK', 'registr', 'plugin', 'car', 'increas', '160000', '2013', '2018', 'with', 'electrif', 'industri', 'estim', 'worth', '6bn', '2025', 'decad', 'present', 'massiv', 'opportun', 'howev', 'ev', 'remain', 'outskir

## 8. Label encoding on all words

In [28]:
from sklearn.preprocessing import LabelEncoder

# Learn the token -> integer mapping, then apply it to the same token list.
LE = LabelEncoder()
LE.fit(TOKENS)
encoded_words = LE.transform(TOKENS)
print(encoded_words)

[142  46 153   9 126 134  58  94  75 141   8 143 130  60  62  19 155  25
 101 162 154 142  13  65 108  22  96  97  27  72  64  43  70 153   4  68
  44  92  34  30  28   5  11   4 140  54 145  87 128  81  90  61 147 159
  29 137  42 162 125  27  23 161 148 156  40  17  47  78 141  50  79  46
  21  63  12  93  63  95  31  41  53 138  13 121 109  27  73   0   1   2
 160  47  74  52 163   7   3  39 112  83  99  69  53 124 102  82  33  98
  84  88 151  35  18  36 153  98 144  16 115  38  66  10  44  59   6  76
 146  91 100 133  51 135 139  89 116  32  80  85  21  72 116 103  14 139
 120 131 157  48  37  53 142 155  71  20 106 123 149 111 114  23 133  84
 106  36  64  43 153  86  33  55  67  44 150  70  46 145 137  65 127 164
 136 127 145  49   5 142 119 104  26  46  27 144  56 158 129  23 118 113
 117 122 123  29 110 142  37  24 141  15  86  95 132 105 152  95 107  45
  23  29  75  77  57  66]


## 9. K-Means clustering 

In [29]:
from sklearn.cluster import KMeans

# Turn the 1-D array of codes into a single-column 2-D array for KMeans.
X = encoded_words.reshape(-1, 1)
# NOTE(review): the integer codes come from label encoding, so distances between
# them presumably carry no linguistic meaning — confirm this is the intended feature.
# A fixed random_state keeps the clustering repeatable between runs.
model = KMeans(n_clusters=6, random_state=710)
model.fit(X)
out = model.predict(X)
print(out)

[4 0 4 5 1 1 3 2 3 4 5 4 1 3 3 5 4 5 2 4 4 4 5 3 2 5 2 2 0 3 3 0 3 4 5 3 0
 2 0 0 0 5 5 5 4 3 4 2 1 2 2 3 4 4 0 4 0 4 1 0 5 4 4 4 0 5 0 3 4 0 3 0 5 3
 5 2 3 2 0 0 3 4 5 1 1 0 3 5 5 5 4 0 3 3 4 5 5 0 1 2 2 3 3 1 2 2 0 2 2 2 4
 0 5 0 4 2 4 5 1 0 3 5 0 3 5 3 4 2 2 1 0 1 4 2 1 0 2 2 5 3 1 2 5 4 1 1 4 0
 0 3 4 4 3 5 2 1 4 1 1 5 1 2 2 0 3 0 4 2 0 3 3 0 4 3 0 4 4 3 1 4 4 1 4 0 5
 4 1 2 0 0 0 4 3 4 1 5 1 1 1 1 1 0 1 4 0 5 4 5 2 2 1 2 4 2 2 0 5 0 3 3 3 3]


In [30]:
# Show the fitted centroid of each cluster.
cluster_centers = model.cluster_centers_
print(cluster_centers)

[[ 38.09756098]
 [122.35483871]
 [ 94.        ]
 [ 65.27027027]
 [148.57777778]
 [ 13.24242424]]
