In [1]:
import pandas as pd
import re as re

In [2]:
# Limit printed frames to ten columns, then build the sample company data.
pd.set_option('display.max_columns', 10)
df = pd.DataFrame(
    {
        'company_code': ['c000@1.','c000,2','c0003', 'c0003#', 'c0004,'],
        'year': ['year 1800','year 1700','year 2300', 'year 1900', 'year 2200'],
    }
)
# A single print call emits the heading and the frame on consecutive lines.
print("Original DataFrame:", df, sep="\n")
def find_punctuations(text):
    """Return every ASCII punctuation character in ``text``, in order of appearance.

    Parameters
    ----------
    text : str
        String to scan.

    Returns
    -------
    list of str
        One single-character string per punctuation mark found (duplicates
        are kept); an empty list when ``text`` contains none.
    """
    # The class lists all 32 ASCII punctuation characters, including '<' and
    # '>'. Matching exactly one character per hit (no quantifier) means
    # findall already returns the final list and never yields empty matches.
    return re.findall(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]', text)
# Store the punctuation found in each company code in a new column.
df['nonalpha'] = df['company_code'].apply(find_punctuations)
print("\nExtracting punctuation:", df, sep="\n")

Original DataFrame:
  company_code       year
0      c000@1.  year 1800
1       c000,2  year 1700
2        c0003  year 2300
3       c0003#  year 1900
4       c0004,  year 2200

Extracting punctuation:
  company_code       year nonalpha
0      c000@1.  year 1800   [@, .]
1       c000,2  year 1700      [,]
2        c0003  year 2300       []
3       c0003#  year 1900      [#]
4       c0004,  year 2200      [,]


In [3]:
# Sample sentences that contain accidentally doubled letters.
df = pd.DataFrame(
    {
        'text_code': ['t0001.','t0002','t0003', 't0004'],
        'text_lang': ['She livedd a long life.', 'How oold is your father?', 'What is tthe problem?','TThhis desk is used by Tom.'],
    }
)
print("Original DataFrame:", df, sep="\n")
def rep_char(str1):
    """Replacement callback for ``re.sub``: reduce the matched text to its first character.

    Parameters
    ----------
    str1 : re.Match
        Match object passed in by ``re.sub``. Despite the name it is not a
        string; the name is kept so existing callers keep working.

    Returns
    -------
    str
        The first character of the matched text. A one-character match is
        returned unchanged and an empty match yields ``''``.
    """
    tchr = str1.group(0)
    # Always return a string. Falling through and returning None would make
    # re.sub drop the matched text entirely for matches of length <= 1.
    return tchr[0:1]  # can change the value here on repetition
def unique_char(rep, sent_text):
    """Substitute every run of a repeated word character in ``sent_text``.

    ``rep`` is the replacement handed to the regex substitution: either a
    string or a callable that receives the match object. A run is two or
    more consecutive occurrences of the same word character. Returns the
    resulting string.
    """
    repeated_run = re.compile(r'(\w)\1+')
    return repeated_run.sub(rep, sent_text)
# Collapse doubled letters in every sentence and keep the result alongside the input.
df['normal_text'] = df['text_lang'].apply(lambda sentence: unique_char(rep_char, sentence))
print("\nRemove repetitive characters:", df, sep="\n")

Original DataFrame:
  text_code                    text_lang
0    t0001.      She livedd a long life.
1     t0002     How oold is your father?
2     t0003        What is tthe problem?
3     t0004  TThhis desk is used by Tom.

Remove repetitive characters:
  text_code                    text_lang                normal_text
0    t0001.      She livedd a long life.     She lived a long life.
1     t0002     How oold is your father?    How old is your father?
2     t0003        What is tthe problem?       What is the problem?
3     t0004  TThhis desk is used by Tom.  This desk is used by Tom.


In [4]:
# Sample addresses whose house numbers are scanned for large values.
df = pd.DataFrame(
    {
        'company_code': ['c0001','c0002','c0003', 'c0003', 'c0004'],
        'address': ['7277 Surrey Ave.1111','920 N. Bishop Ave.','9910 Golden Star St.', '1025 Dunbar St.', '1700 West Livingston Court'],
    }
)
print("Original DataFrame:", df, sep="\n")
def test_num_great(text): 
    result = re.findall(r'95[5-9]|9[6-9]\d|[1-9]\d{3,}',text)
    return " ".join(result)
# Extract the large numbers from every address into a new column.
df['num_great'] = df['address'].apply(test_num_great)
print("\nNumber greater than 940:", df, sep="\n")

Original DataFrame:
  company_code                     address
0        c0001        7277 Surrey Ave.1111
1        c0002          920 N. Bishop Ave.
2        c0003        9910 Golden Star St.
3        c0003             1025 Dunbar St.
4        c0004  1700 West Livingston Court

Number greater than 940:
  company_code                     address  num_great
0        c0001        7277 Surrey Ave.1111  7277 1111
1        c0002          920 N. Bishop Ave.           
2        c0003        9910 Golden Star St.        991
3        c0003             1025 Dunbar St.       1025
4        c0004  1700 West Livingston Court       1700


In [5]:
# Sample addresses whose house numbers are scanned for small values.
df = pd.DataFrame(
    {
        'company_code': ['c0001','c0002','c0003', 'c0003', 'c0004'],
        'address': ['72 Surrey Ave.11','92 N. Bishop Ave.','9910 Golden Star St.', '102 Dunbar St.', '17 West Livingston Court'],
    }
)
print("Original DataFrame:", df, sep="\n")

def test_num_less(n):
    nums = []
    for i in n.split():
        result = re.findall(r'\b(0*(?:[1-9][0-9]?|100))\b',i)
        nums.append(result)
        all_num=[",".join(x) for x in nums if x != []]
    return " ".join(all_num)

# Extract the small numbers from every address into a new column.
df['num_less'] = df['address'].apply(test_num_less)
print("\nNumber less than 100:", df, sep="\n")

Original DataFrame:
  company_code                   address
0        c0001          72 Surrey Ave.11
1        c0002         92 N. Bishop Ave.
2        c0003      9910 Golden Star St.
3        c0003            102 Dunbar St.
4        c0004  17 West Livingston Court

Number less than 100:
  company_code                   address num_less
0        c0001          72 Surrey Ave.11    72 11
1        c0002         92 N. Bishop Ave.       92
2        c0003      9910 Golden Star St.         
3        c0003            102 Dunbar St.         
4        c0004  17 West Livingston Court       17


In [6]:
# Sample addresses used to test for two terms occurring together.
df = pd.DataFrame(
    {
        'company_code': ['c0001','c0002','c0003', 'c0003', 'c0004'],
        'address': ['9910 Surrey Ave.','92 N. Bishop Ave.','9910 Golden Star Ave.', '102 Dunbar St.', '17 West Livingston Court'],
    }
)
print("Original DataFrame:", df, sep="\n")
def test_and_cond(text):
    result = re.findall(r'(?=.*Ave.)(?=.*9910).*', text) 
    return " ".join(result)
# Keep the address only where both search terms are present.
df['check_two_words'] = df['address'].apply(test_and_cond)
print("\nPresent two words!", df, sep="\n")

Original DataFrame:
  company_code                   address
0        c0001          9910 Surrey Ave.
1        c0002         92 N. Bishop Ave.
2        c0003     9910 Golden Star Ave.
3        c0003            102 Dunbar St.
4        c0004  17 West Livingston Court

Present two words!
  company_code                   address        check_two_words
0        c0001          9910 Surrey Ave.       9910 Surrey Ave.
1        c0002         92 N. Bishop Ave.                       
2        c0003     9910 Golden Star Ave.  9910 Golden Star Ave.
3        c0003            102 Dunbar St.                       
4        c0004  17 West Livingston Court                       
