***
Python Regular Expression Quick Guide
^ Matches the beginning of a line
$ Matches the end of the line
. Matches any character (except a newline, unless re.DOTALL is used)
\s Matches whitespace
\S Matches any non-whitespace character
* Repeats a character zero or more times
*? Repeats a character zero or more times
(non-greedy)
+ Repeats a character one or more times
+? Repeats a character one or more times
(non-greedy)
[aeiou] Matches a single character in the listed set
[^XYZ] Matches a single character not in the listed set
[a-z0-9] The set of characters can include a range
( Indicates where string extraction is to start
) Indicates where string extraction is to end
***

### Expressions
***
`\d`                         Any numeric digit from `0` to `9`.

`\D`                         Matches any character which is not a decimal digit.
                           This is the opposite of `\d`.

`\w`                         Any letter, numeric digit, or the underscore
                           character.  (Think of this as matching
                           "word" characters.)

`\W`                         Any character that is not a letter,
                           numeric digit, or the underscore character.

`\s`                         Any whitespace character: space, tab, newline,
                           carriage return, form feed, or vertical tab.
                           (Think of this as matching white-space characters.)

`\S`                         Any character that is not whitespace (not a space,
                           tab, or newline). This is the opposite of `\s`.
***

In [3]:
import re
import pandas as pd

In [4]:
# Sample string mixing letters and digits, used by the first re.search examples.
text = "A78L41K"

In [6]:
# re.search scans `text` and returns a Match object for the first digit it
# finds (or None when nothing matches). The pattern is a raw string so that
# "\d" reaches the regex engine instead of being an invalid string escape.
num = re.search(r"\d", text)
num

<re.Match object; span=(1, 2), match='7'>

In [8]:
# Two consecutive digits: matches the first adjacent pair of digits in `text`.
# Raw string avoids the invalid "\d" string escape.
num = re.search(r"\d\d", text)
num

<re.Match object; span=(1, 3), match='78'>

In [7]:
# group(0) is the whole matched substring of the most recent search stored in `num`.
# NOTE: the recorded output below comes from an out-of-order run; executed top to
# bottom, `num` holds the two-digit match from the previous cell.
num.group(0)

'7'

In [9]:
# \w matches one "word" character (letter, digit, or underscore); the first
# one in `text` is returned. Raw string avoids the invalid "\w" string escape.
num = re.search(r"\w", text)
num

<re.Match object; span=(0, 1), match='A'>

In [10]:
# Two consecutive word characters, taken from the first position where they occur.
num = re.search(r"\w\w", text)
num

<re.Match object; span=(0, 2), match='A7'>

In [11]:
# New sample: starts with a digit, so the first non-digit is at index 1.
text = "8PM19MIN"

In [15]:
# \D matches any character that is NOT a digit; re.search returns the first one.
# Raw string avoids the invalid "\D" string escape.
nondigi = re.search(r"\D", text)
print(nondigi)          # the Match object (span + matched text)
print(nondigi.group())  # just the matched character

<re.Match object; span=(1, 2), match='P'>
P


In [27]:
# Space-separated phone number used by the next search.
# Alternative to try: n = re.findall(r'[\d]+', text)
text = 'My phone number is 555 666 7777'

In [28]:
# Match 3 digits, a space, 3 digits, a space, then 4 digits.
# Raw string avoids the invalid "\d" string escapes.
telno = re.search(r"\d\d\d \d\d\d \d\d\d\d", text)
print(telno)
print(telno.group())

<re.Match object; span=(19, 31), match='555 666 7777'>
555 666 7777


In [34]:
# Dash-separated phone number used by the following search examples.
text = 'My phone number is 415-555-1212'

In [32]:
# Match 3 digits, a dash, 3 digits, a dash, then 4 digits.
# Raw string avoids the invalid "\d" string escapes.
telno = re.search(r"\d\d\d-\d\d\d-\d\d\d\d", text)
print(telno)
print(telno.group())

# Alternative to try (not executed):
# text = 'My phone number is 415-555-1212'
# s = re.search(r'[\d]+[^-].*', text)


<re.Match object; span=(19, 31), match='415-555-1212'>
415-555-1212


In [36]:
# Build the same pattern by string repetition: r"\d" * 3 == r"\d\d\d".
# Each piece is a raw string so the backslash is passed through to the regex engine.
telno = re.search(r"\d" * 3 + "-" + r"\d" * 3 + "-" + r"\d" * 4, text)
print(telno)
print(telno.group())

<re.Match object; span=(19, 31), match='415-555-1212'>
415-555-1212


In [41]:
# Parentheses split the match into capture groups.
telno = re.search(r"(\d\d\d)-(\d\d\d)-(\d\d\d\d)", text)
print(telno)
# group(0) is the whole match; group(1) is the first group, group(2) the second, ...
print(telno.group(0))

<re.Match object; span=(19, 31), match='415-555-1212'>
415-555-1212


In [44]:
# Save the sample text to disk. The encoding is given explicitly so the file
# content does not depend on the platform's default encoding.
with open("text.txt", "w", encoding="utf-8") as file:
    file.write(text)

In [51]:
# Read the saved text back (explicit encoding, independent of platform default).
with open("text.txt", "r", encoding="utf-8") as file:
    txt = file.read()
print(txt)

# Two capture groups: group 1 is the first three digits, group 2 is the
# remaining "ddd-dddd" part. Raw string avoids the invalid "\d" string escapes.
output = re.search(r"(\d\d\d)-(\d\d\d-\d\d\d\d)", txt)
print(output.group(1))
print(output.group(2))

My phone number is 415-555-1212
415
555-1212


In [53]:
# Sample string containing digit runs of different lengths.
value = "O 1, t 10, o 100. 100000"   

In [54]:
# {1} matches exactly one digit at a time, so every digit is returned separately.
# {2} would return only non-overlapping pairs of adjacent digits, and {1,6}
# would greedily return runs of one up to six consecutive digits.
# Raw string avoids the invalid "\d" string escape.
output = re.findall(r"\d{1}", value)
print(output)

['1', '1', '0', '1', '0', '0', '1', '0', '0', '0', '0', '0']


In [None]:
# Alternative to try (not executed): collect every run of digits in `value`.
# sayi = re.findall('[0-9]+', value)

In [57]:
# Sample string for the re.sub examples: digits, dashes, and trailing text.
phone = "2004-959-559 # This is Phone Number"

In [61]:
# re.sub(pattern, replacement, string): the first argument is what to replace,
# the second is what to replace it with. \D replaces every non-digit character.
# Raw string avoids the invalid "\D" string escape.
output = re.sub(r"\D", "*", phone)
print(output)

2004*959*559***********************


In [63]:
# \d replaces every digit character; everything else is left unchanged.
# Raw string avoids the invalid "\d" string escape.
output = re.sub(r"\d", "~", phone)
print(output)

~~~~-~~~-~~~ # This is Phone Number


### Special Characters
___
``"[]"``	  A set of characters	``"[a-m]"``

``"\"``	      Signals a special sequence (can also be used to escape special characters)

``"."``	      Any character (except newline character)

``"^"``	      Starts with	``"^hello"``

``"$"``	      Ends with	``"world$"``

``"*"``	      Zero or more occurrences

`"+"`	      One or more occurrences

`"{}"`	  Exactly the specified number of occurrences

`"|"`	      Either or	`"falls|stays"`

`"()"`	  Capture and group
___

In [65]:
# Sample sentence containing two separate numbers.
txt = "1 person against 100 people"

In [66]:
# \d+ matches one or more consecutive digits, so each number comes back whole.
# Raw string avoids the invalid "\d" string escape.
output = re.findall(r"\d+", txt)
print(output)

['1', '100']


In [67]:
# Sample string for the ^ (start) and $ (end) anchor examples.
txt = "hello world"

In [68]:
# ^ anchors the pattern to the start of the string: "h" only counts at position 0.
starts_with_h = re.compile("^h")
output = starts_with_h.findall(txt)
print(output)

['h']


In [69]:
# The whole word "hello" must appear at the very start of the string.
output = re.findall(pattern="^hello", string=txt)
print(output)

['hello']


In [70]:
# $ anchors the pattern to the end of the string: "world" must be the last word.
output = re.findall(pattern="world$", string=txt)
print(output)

['world']


In [71]:
# Small Series of strings; the last entry deliberately contains no digit.
s = pd.Series(
    ["a3", "b4", "c5", "d"]
)

In [72]:
# Element-wise test: True where the string contains at least one digit.
# Raw string avoids the invalid "\d" string escape; the extra parentheses were redundant.
s.str.contains(r"\d")

0     True
1     True
2     True
3    False
dtype: bool

In [None]:
# Same digit check as s.str.contains, written with apply: the lambda returns
# True when re.search finds a digit in the element and False otherwise.
s.apply(lambda x: bool(re.search(r"\d", x)))

In [73]:
# The extract method of the .str accessor pulls out the captured group:
# here the first digit of each element (NaN where there is no digit).
s.str.extract(r"(\d)")

Unnamed: 0,0
0,3.0
1,4.0
2,5.0
3,


In [74]:
# Capture the first non-digit character of each element.
s.str.extract(r"(\D)")

Unnamed: 0,0
0,a
1,b
2,c
3,d


In [76]:
# Capture the first word character (letter, digit, or underscore) of each element.
s.str.extract(r"(\w)")

Unnamed: 0,0
0,a
1,b
2,c
3,d


In [82]:
# Each element: one letter, one digit, then three non-digit characters.
s = pd.Series(
    ["a3aac", "b4aaa", "c5aae"]
)

In [83]:
# Four capture groups give four output columns; the digit between the first
# and second group is matched but not captured (it has no parentheses).
s.str.extract(r"(\w)\d(\D)(\D)(\D)")

Unnamed: 0,0,1,2,3
0,a,a,a,c
1,b,a,a,a
2,c,a,a,e


In [89]:
# Consumption strings: a leading number, a unit ("l" or "kg"), then "/100 km (comb)".
s = pd.Series([
    '40 l/100 km (comb)',
    '38 l/100 km (comb)',
    '6.4 l/100 km (comb)',
    '8.3 kg/100 km (comb)',
    '5.1 kg/100 km (comb)',
    '5.4 l/100 km (comb)',
    '6.7 l/100 km (comb)',
    '6.2 l/100 km (comb)',
    '7.3 l/100 km (comb)',
    '6.3 l/100 km (comb)',
    '5.7 l/100 km (comb)',
    '6.1 l/100 km (comb)',
    '6.8 l/100 km (comb)',
    '7.5 l/100 km (comb)',
    '7.4 l/100 km (comb)',
    '3.6 kg/100 km (comb)',
    '0 l/100 km (comb)',
    '7.8 l/100 km (comb)',
])

In [86]:
# Display the Series to inspect the raw strings before extracting from them.
s

0       40 l/100 km (comb)
1       38 l/100 km (comb)
2      6.4 l/100 km (comb)
3     8.3 kg/100 km (comb)
4     5.1 kg/100 km (comb)
5      5.4 l/100 km (comb)
6      6.7 l/100 km (comb)
7      6.2 l/100 km (comb)
8      7.3 l/100 km (comb)
9      6.3 l/100 km (comb)
10     5.7 l/100 km (comb)
11     6.1 l/100 km (comb)
12     6.8 l/100 km (comb)
13     7.5 l/100 km (comb)
14     7.4 l/100 km (comb)
15    3.6 kg/100 km (comb)
16       0 l/100 km (comb)
17     7.8 l/100 km (comb)
dtype: object

In [94]:
# Alternation is tried left to right: first two digits, then digit-dot-digit,
# then a single digit. The dot is escaped so it matches only a literal decimal
# point (an unescaped "." would match any character).
result = s.str.extract(r"(\d\d|\d\.\d|\d)")
print(result)

      0
0    40
1    38
2   6.4
3   8.3
4   5.1
5   5.4
6   6.7
7   6.2
8   7.3
9   6.3
10  5.7
11  6.1
12  6.8
13  7.5
14  7.4
15  3.6
16    0
17  7.8


In [97]:
# First group: the leading number (decimal point escaped so it is literal).
# ".+" then means "one or more of any character", which skips ahead to the
# second group: three consecutive digits.
result = s.str.extract(r"(\d\d|\d\.\d|\d).+(\d\d\d)")
print(result)

      0    1
0    40  100
1    38  100
2   6.4  100
3   8.3  100
4   5.1  100
5   5.4  100
6   6.7  100
7   6.2  100
8   7.3  100
9   6.3  100
10  5.7  100
11  6.1  100
12  6.8  100
13  7.5  100
14  7.4  100
15  3.6  100
16    0  100
17  7.8  100


In [98]:
# Group 1: the number at the start of the string, with an optional literal
# decimal point (escaped so it cannot match an arbitrary character).
# " \w*/" consumes the unit between the first number and the slash so that
# it is not captured; group 2 is the run of digits after the slash.
result = s.str.extract(r"(^\d*\.?\d*) \w*/(\d*)")

In [99]:
# Display the extracted columns.
result

Unnamed: 0,0,1
0,40.0,100
1,38.0,100
2,6.4,100
3,8.3,100
4,5.1,100
5,5.4,100
6,6.7,100
7,6.2,100
8,7.3,100
9,6.3,100


In [101]:
# \S+ is one or more non-whitespace characters, i.e. everything up to the
# first space. Raw string avoids the invalid "\S" string escape.
output = s.str.extract(r'(\S+)')
output

Unnamed: 0,0
0,40.0
1,38.0
2,6.4
3,8.3
4,5.1
5,5.4
6,6.7
7,6.2
8,7.3
9,6.3


In [102]:
# Each element: "MM/YYYY", two newlines, then a consumption or emission string.
s = pd.Series([
    '06/2020\n\n4.9 l/100 km (comb)',
    '11/2020\n\n166 g CO2/km (comb)',
    '10/2019\n\n5.3 l/100 km (comb)',
    '05/2022\n\n6.3 l/100 km (comb)',
    '07/2019\n\n128 g CO2/km (comb)',
    '06/2022\n\n112 g CO2/km (comb)',
    '01/2022\n\n5.8 l/100 km (comb)',
    '11/2020\n\n106 g CO2/km (comb)',
    '04/2019\n\n105 g CO2/km (comb)',
    '08/2020\n\n133 g CO2/km (comb)',
    '04/2022\n\n133 g CO2/km (comb)',
])

In [103]:
# Display the Series to inspect the raw strings before extracting from them.
s

0     06/2020\n\n4.9 l/100 km (comb)
1     11/2020\n\n166 g CO2/km (comb)
2     10/2019\n\n5.3 l/100 km (comb)
3     05/2022\n\n6.3 l/100 km (comb)
4     07/2019\n\n128 g CO2/km (comb)
5     06/2022\n\n112 g CO2/km (comb)
6     01/2022\n\n5.8 l/100 km (comb)
7     11/2020\n\n106 g CO2/km (comb)
8     04/2019\n\n105 g CO2/km (comb)
9     08/2020\n\n133 g CO2/km (comb)
10    04/2022\n\n133 g CO2/km (comb)
dtype: object

In [104]:
# Two digit runs separated by exactly one character. The unescaped "." is a
# wildcard that matches any single character (here it matches the "/").
# Raw string avoids the invalid "\d" string escapes.
result = s.str.extract(r"(\d+).(\d+)")
result

Unnamed: 0,0,1
0,6,2020
1,11,2020
2,10,2019
3,5,2022
4,7,2019
5,6,2022
6,1,2022
7,11,2020
8,4,2019
9,8,2020


In [105]:
# One group: non-whitespace characters on both sides of a slash, kept together.
# Raw string avoids the invalid "\S" string escapes.
result = s.str.extract(r'(\S+/\S+)')
result

Unnamed: 0,0
0,06/2020
1,11/2020
2,10/2019
3,05/2022
4,07/2019
5,06/2022
6,01/2022
7,11/2020
8,04/2019
9,08/2020


In [106]:
# Two groups: the non-whitespace text before the slash and after it end up
# in separate columns; the slash itself is matched but not captured.
result = s.str.extract(r'(\S+)/(\S+)')
result

Unnamed: 0,0,1
0,6,2020
1,11,2020
2,10,2019
3,5,2022
4,7,2019
5,6,2022
6,1,2022
7,11,2020
8,4,2019
9,8,2020


In [108]:
# Exactly two digits, a literal slash, then exactly four digits.
# Raw string avoids the invalid "\d" string escapes.
result = s.str.extract(r'(\d{2})/(\d{4})')
result

Unnamed: 0,0,1
0,6,2020
1,11,2020
2,10,2019
3,5,2022
4,7,2019
5,6,2022
6,1,2022
7,11,2020
8,4,2019
9,8,2020


In [109]:
# Same as the slash version, but "." accepts any single separator character
# between the two-digit and four-digit groups.
# Raw string avoids the invalid "\d" string escapes.
result = s.str.extract(r'(\d{2}).(\d{4})')
result

Unnamed: 0,0,1
0,6,2020
1,11,2020
2,10,2019
3,5,2022
4,7,2019
5,6,2022
6,1,2022
7,11,2020
8,4,2019
9,8,2020
