## Python Regular Expressions
Regular expressions are a powerful tool for various kinds of string manipulation.
<br>Regular expressions in Python can be accessed using the **re** module, which is part of the standard library.  


In [4]:
import re

In [35]:
import time
import requests
import numpy as np
import pandas as pd
from textblob import TextBlob, Word
from scrapy.http import TextResponse

In [5]:
name = "Hrant Davtyan"

In [6]:
type(name)

str

In [7]:
print(name.capitalize())
print(name.upper())
print(name.lower())

Hrant davtyan
HRANT DAVTYAN
hrant davtyan


In [8]:
# get an index of a given string
print(name.find("H"))
print(name.find("D"))
print(name.find("y"))
print(name[0],name[6],name[10])#just checking

0
6
10
H D y


In [9]:
"D" in name

True

In [10]:
"d" in name

False

In [11]:
#counts how many times a given character appears in a string
print(name.count("D"))
print(name.count("a"))


1
3


In [12]:
my_sentence = "Hrant is in Dilijan. Hrant is teaching."
my_sentence.count("Hrant")

2

In [13]:
my_sentence.find("is")

6

<b>The strip() removes white space from both left and right sides

In [14]:
name_with_spaces = "       Hrant       Davtyan "

In [15]:
name_with_spaces.strip()

'Hrant       Davtyan'

<b> The split() method splits a string into a list.

You can specify the separator, default separator is any whitespace.

In [16]:
name.split()

['Hrant', 'Davtyan']

In [17]:
name.split("a")## now the separator is the 'a'

['Hr', 'nt D', 'vty', 'n']

In [18]:
name.replace("y","i")

'Hrant Davtian'

In [19]:
#key string methods:
#find(), replace(), strip(), split(), count()

In [20]:
# . = anything (including nothing, just a space)
# * = repetition 0 or more times
# + = repetition 1 or more times

In [21]:
re.findall("Davt.+",name)

['Davtyan']

In [22]:
my_email = "Hdavtyan@aua.am"

In [23]:
re.findall(".+@.+",my_email)

['Hdavtyan@aua.am']

In [24]:
sentence = "Hrant Davtyans email is hdavtyan@aua.am"

In [0]:
re.findall(" .+@.+",sentence)

[' Davtyans email is hdavtyan@aua.am']

In [0]:
new_sentence = "Republic of Armenia received its independence in 1991000 1991"

In [0]:
re.findall("\d{4}",new_sentence)

['1991', '1991']

In [0]:
re.compile("\d{4}") #string as an input, pattern as an output

re.compile(r'\d{4}', re.UNICODE)

In [0]:
re.sub(".+ of","Kingdom of",new_sentence)

'Kingdom of Armenia received its independence in 1991000 1991'

In [0]:
new_sentence = re.sub(".+ of","Kingdom of",new_sentence)

In [0]:
new_sentence

'Kingdom of Armenia received its independence in 1991000 1991'

In [0]:
re.sub("(.+)( of)","\1Federation \2and",new_sentence)

'\x01Federation \x02and Armenia received its independence in 1991000 1991'

In [25]:
mj_song  = """
She was more like a beauty queen from a movie scene
I said don't mind, but what do you mean, I am the one
Who will dance on the floor in the round
She said I am the one, who will dance on the floor in the round
"""

In [26]:
type(mj_song)

str

In [27]:
print(mj_song.replace("She","He").replace("queen","king"))


He was more like a beauty king from a movie scene
I said don't mind, but what do you mean, I am the one
Who will dance on the floor in the round
He said I am the one, who will dance on the floor in the round



In [28]:
re.sub("She","",mj_song)

"\n was more like a beauty queen from a movie scene\nI said don't mind, but what do you mean, I am the one\nWho will dance on the floor in the round\n said I am the one, who will dance on the floor in the round\n"

In [29]:
eminem_movie = """8 Mile opened at No. 1 with $51,240,555 in its opening weekend, the then second highest opening for an R-rated movie in the U.S.[4] The film would go on to gross $116,750,901 domestically, and $126,124,177 overseas for a total of $242,875,078 worldwide.[2] The film's final domestic gross would hold the film at No. 3 in Box Office Mojo's "Pop Star Debuts" list, behind Austin Powers in Goldmember (Beyoncé) and The Bodyguard (Whitney Houston)."""

In [31]:
for i in eminem_movie.split():
    if i[0]=="$":
        print(int(i.replace(",","")[1:]))

51240555
116750901
126124177
242875078


In [32]:
re.findall("\$(\S+)",eminem_movie.replace(",",""))

['51240555', '116750901', '126124177', '242875078']

In [36]:
url = "http://books.toscrape.com/"
page = requests.get(url)
response = TextResponse(url=page.url,body=page.text,encoding="utf-8")

In [37]:
response.css("p.price_color::text").re("\d.*\d")

['51.77',
 '53.74',
 '50.10',
 '47.82',
 '54.23',
 '22.65',
 '33.34',
 '17.93',
 '22.60',
 '52.15',
 '13.99',
 '20.66',
 '17.46',
 '52.29',
 '35.02',
 '57.25',
 '23.88',
 '37.59',
 '51.33',
 '45.17']

In [48]:
response.css("p[class^='star-rating']::attr(class)").extract()


['star-rating Three',
 'star-rating One',
 'star-rating One',
 'star-rating Four',
 'star-rating Five',
 'star-rating One',
 'star-rating Four',
 'star-rating Three',
 'star-rating Four',
 'star-rating One',
 'star-rating Two',
 'star-rating Four',
 'star-rating Five',
 'star-rating Five',
 'star-rating Five',
 'star-rating Three',
 'star-rating One',
 'star-rating One',
 'star-rating Two',
 'star-rating Two']

In [47]:
response.css("p[class^='star-rating']::attr(class)").re('^star-rating.+')

['star-rating Three',
 'star-rating One',
 'star-rating One',
 'star-rating Four',
 'star-rating Five',
 'star-rating One',
 'star-rating Four',
 'star-rating Three',
 'star-rating Four',
 'star-rating One',
 'star-rating Two',
 'star-rating Four',
 'star-rating Five',
 'star-rating Five',
 'star-rating Five',
 'star-rating Three',
 'star-rating One',
 'star-rating One',
 'star-rating Two',
 'star-rating Two']

# IMDB

In [49]:
imdb_url = "https://www.imdb.com/chart/top?ref_=nv_mv_250"
page = requests.get(imdb_url)
response = TextResponse(url=page.url,body=page.text,encoding="utf-8")

In [51]:
name = response.css("td.titleColumn > a::text").extract()
href = response.css("td.titleColumn > a::attr(href)").extract()
year = response.css("span.secondaryInfo::text").re("\d{4}")## getting only first 4  digits
rating = response.css("td[class='ratingColumn imdbRating'] > strong::text").extract()

In [53]:
print(name,"\n",href,"\n",year,"\n",rating)#\n stands for new line

['The Shawshank Redemption', 'The Godfather', 'The Godfather: Part II', 'The Dark Knight', '12 Angry Men', "Schindler's List", 'The Lord of the Rings: The Return of the King', 'Pulp Fiction', 'Il buono, il brutto, il cattivo', 'Fight Club', 'The Lord of the Rings: The Fellowship of the Ring', 'Forrest Gump', 'Inception', 'Star Wars: Episode V - The Empire Strikes Back', 'The Lord of the Rings: The Two Towers', "One Flew Over the Cuckoo's Nest", 'Goodfellas', 'The Matrix', 'Shichinin no samurai', 'Avengers: Endgame', 'Se7en', 'Cidade de Deus', 'Star Wars', 'The Silence of the Lambs', "It's a Wonderful Life", 'La vita è bella', 'Sen to Chihiro no kamikakushi', 'Saving Private Ryan', 'The Usual Suspects', 'Léon', 'The Green Mile', 'Interstellar', 'Psycho', 'American History X', 'City Lights', 'Casablanca', 'Once Upon a Time in the West', 'The Pianist', 'Modern Times', 'The Intouchables', 'The Departed', 'Back to the Future', 'Terminator 2: Judgment Day', 'The Lion King', 'Whiplash', 'Rear

In [54]:
before_1990 = [i for i in year if int(i)<1990]

In [58]:
print(before_1990)

['1972', '1974', '1957', '1966', '1980', '1975', '1954', '1977', '1946', '1960', '1931', '1942', '1968', '1936', '1985', '1954', '1981', '1979', '1979', '1988', '1988', '1940', '1950', '1964', '1957', '1980', '1957', '1986', '1984', '1981', '1941', '1958', '1959', '1983', '1984', '1931', '1968', '1962', '1971', '1952', '1944', '1976', '1987', '1948', '1962', '1921', '1973', '1983', '1965', '1960', '1927', '1975', '1989', '1950', '1950', '1959', '1961', '1988', '1948', '1952', '1980', '1963', '1974', '1988', '1985', '1949', '1961', '1925', '1957', '1957', '1954', '1980', '1939', '1982', '1954', '1957', '1926', '1939', '1982', '1985', '1978', '1924', '1967', '1953', '1940', '1979', '1986', '1927', '1934', '1986', '1966', '1976', '1959', '1979', '1953', '1959', '1966', '1984', '1928', '1987', '1975', '1976', '1969', '1989', '1940', '1984', '1941', '1982', '1939', '1955', '1975', '1986', '1984', '1973', '1975', '1988']


In [55]:
len(before_1990)

116

In [57]:
for i,j in zip(year,name):
    if int(i)<1990:
        print(j)

The Godfather
The Godfather: Part II
12 Angry Men
Il buono, il brutto, il cattivo
Star Wars: Episode V - The Empire Strikes Back
One Flew Over the Cuckoo's Nest
Shichinin no samurai
Star Wars
It's a Wonderful Life
Psycho
City Lights
Casablanca
Once Upon a Time in the West
Modern Times
Back to the Future
Rear Window
Raiders of the Lost Ark
Apocalypse Now
Alien
Hotaru no haka
Nuovo Cinema Paradiso
The Great Dictator
Sunset Blvd.
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb
Paths of Glory
The Shining
Witness for the Prosecution
Aliens
Once Upon a Time in America
Das Boot
Citizen Kane
Vertigo
North by Northwest
Star Wars: Episode VI - Return of the Jedi
Amadeus
M - Eine Stadt sucht einen Mörder
2001: A Space Odyssey
Lawrence of Arabia
A Clockwork Orange
Singin' in the Rain
Double Indemnity
Taxi Driver
Full Metal Jacket
Ladri di biciclette
To Kill a Mockingbird
The Kid
The Sting
Scarface
Per qualche dollaro in più
The Apartment
Metropolis
Monty Python and the Holy Gr

In [59]:
movie_dict = {"name":name,"year":year,"rating":rating,"url":href}#creating dictionary

In [60]:
movie_df = pd.DataFrame(movie_dict)#every key of dictionary become column

In [61]:
movie_df.head()

Unnamed: 0,name,year,rating,url
0,The Shawshank Redemption,1994,9.2,/title/tt0111161/
1,The Godfather,1972,9.2,/title/tt0068646/
2,The Godfather: Part II,1974,9.0,/title/tt0071562/
3,The Dark Knight,2008,9.0,/title/tt0468569/
4,12 Angry Men,1957,8.9,/title/tt0050083/


In [None]:
movie_df.to_excel("movies.xlsx")

In [62]:
movie_df.rating.apply(float).mean()

8.259999999999973

In [63]:
movie_df.year.apply(int).mode()

0    1995
1    2014
dtype: int64