In [134]:
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
from nltk import word_tokenize
from fuzzywuzzy import fuzz
from collections import Counter
from string import punctuation
from string import digits
from stop_words import get_stop_words
from nltk.corpus import stopwords
from fuzzywuzzy import process

In [135]:
punct = punctuation+'©«»—…“”*№– ' + digits
stop_words_list = get_stop_words('ru')
stop_words_list2 = stopwords.words('russian')
newlist = stop_words_list + stop_words_list2
for word in ["свой"]:
    newlist.append(word)
newlist_set = set(newlist)

In [None]:
df = pd.read_csv('result_tar.gz', compression='gzip', sep='\t', error_bad_lines=False)

In [137]:
df = df.drop(["url", "video_url", "source_name", "author_name"], axis=1)
df = df.dropna()
df = df.drop_duplicates()
df = df.reset_index(drop=True)

In [138]:
def which_language (text):
    if bool(re.search('[\u0400-\u04FF]', text)) == True:
        return 'russian'
    else:
        return 'english'
    
lang=[]
for i,text in enumerate(df['result.tsv'].values): 
    lang.append(which_language(str(text)))
    
df['Language'] = lang
grouped=df.groupby('Language')
for name, group in grouped: 
    if name == 'english': 
        english = group.drop('Language',axis=1) 
    else: 
        russian = group.drop('Language',axis=1)


In [139]:
unique_leagues = russian['league_name_in_russian'].unique().tolist()
len(unique_leagues)

288

In [140]:
unique_teams1 = russian['team_of_season_1_name_in_russian'].unique().tolist()
len(unique_teams1)

3773

In [141]:
unique_teams2 = russian['team_of_season_2_name_in_russian'].unique().tolist()
len(unique_teams2)

3730

In [142]:
unique_teams = unique_teams1 + unique_teams2
unique_teams_set = set(unique_teams)
len(unique_teams_set)

4422

In [143]:
russian = pd.read_csv('russian_lists_and_directmatch.csv')

In [144]:
russian = russian[russian['result.tsv'].map(len) > 30]

In [145]:
russian = russian.reset_index(drop=True)

In [146]:
russian.shape

(524733, 10)

In [147]:
russian.head(3)

Unnamed: 0,result.tsv,odd,tip_name,bookmaker_name,match_date_time,league_name_in_russian,team_of_season_1_name_in_russian,team_of_season_2_name_in_russian,list_direct_match_for_teams1,list_direct_match_for_teams2
0,Сборная Турции продолжает свою дорогу на Чемпи...,1.65,"- 1,5",Олимп,2019-06-02 18:00:00,Международные товарищеские матчи,Турция,Узбекистан,0.0,1.0
1,Двадцать девятого числа состоится один весьма ...,1.52,"– тотал меньше 2,5",Winline,2019-06-28 23:20:00,Кубок Америки,Колумбия,Чили,0.0,0.0
2,АЗ Алкмар В первом матче голландский клуб созд...,1.85,АЗ Алкмар забьет в обоих таймах,Лига Ставок,2019-08-15 18:30:00,Лига Европы УЕФА,АЗ Алкмаар,Мариуполь,0.0,0.0


In [148]:
subset_russian = russian.sample(n=10000)
subset_russian.head(5)


Unnamed: 0,result.tsv,odd,tip_name,bookmaker_name,match_date_time,league_name_in_russian,team_of_season_1_name_in_russian,team_of_season_2_name_in_russian,list_direct_match_for_teams1,list_direct_match_for_teams2
8190,Леганес в этом сезоне неплохо умеет выжимать р...,2.34,1X,Леон,2019-04-15 19:00:00,Примера,Леганес,Реал Мадрид,1.0,0.0
440344,"Сегодня состоится матч по Футболу, в 2 - Дивиз...",1.48,Ф1 (0),1xСтавка,2019-04-20 16:00:00,Сегунда,Альбасете,Лас-Пальмас,0.0,0.0
291808,Квалификация Евро. Финляндия против Италии. Ду...,1.72,П2,1xСтавка,2019-09-08 18:45:00,Чемпионат Европы. Квалификация,Финляндия,Италия,0.0,0.0
234610,"Арсенал дома, команда сильнее и увереннее",1.444,П1,Winline,2019-09-22 15:30:00,Премьер-лига,Арсенал,Астон Вилла,0.0,0.0
433349,барселона проиграла лигу чемпионов она шас буд...,4.1,Ф1 (-2),Леон,2019-05-25 19:00:00,Кубок Испании,Барселона,Валенсия,0.0,0.0


In [149]:
subset_russian = subset_russian.reset_index(drop=True)
subset_russian.head(5)

Unnamed: 0,result.tsv,odd,tip_name,bookmaker_name,match_date_time,league_name_in_russian,team_of_season_1_name_in_russian,team_of_season_2_name_in_russian,list_direct_match_for_teams1,list_direct_match_for_teams2
0,Леганес в этом сезоне неплохо умеет выжимать р...,2.34,1X,Леон,2019-04-15 19:00:00,Примера,Леганес,Реал Мадрид,1.0,0.0
1,"Сегодня состоится матч по Футболу, в 2 - Дивиз...",1.48,Ф1 (0),1xСтавка,2019-04-20 16:00:00,Сегунда,Альбасете,Лас-Пальмас,0.0,0.0
2,Квалификация Евро. Финляндия против Италии. Ду...,1.72,П2,1xСтавка,2019-09-08 18:45:00,Чемпионат Европы. Квалификация,Финляндия,Италия,0.0,0.0
3,"Арсенал дома, команда сильнее и увереннее",1.444,П1,Winline,2019-09-22 15:30:00,Премьер-лига,Арсенал,Астон Вилла,0.0,0.0
4,барселона проиграла лигу чемпионов она шас буд...,4.1,Ф1 (-2),Леон,2019-05-25 19:00:00,Кубок Испании,Барселона,Валенсия,0.0,0.0


### Команда 1

In [None]:
for i, text in tqdm(enumerate(subset_russian['result.tsv'].values)):
    if subset_russian['list_direct_match_for_teams1'][i] == 1:
        continue
    else:
        #print(i)
        capital_words_ru = re.findall(r'([А-Я][а-я]+)', text)
        quotes = re.findall(r'\"(.+?)\"', text)
        quotes2 = re.findall(r'«(.+?)»', text)
        quotes3 = re.findall(r'\'(.+?)\'', text)
        search_words = capital_words_ru + quotes + quotes2 + quotes3
        search_words = set(search_words)
        good_words = [word for word in search_words if word.lower() not in newlist_set] 
        #print(good_words)
    
        for word in good_words:
            for team in unique_teams:
                if fuzz.ratio(word, team) > 85:
                    #print(russian['team_of_season_2_name_in_russian'][i], word, team, fuzz.ratio(word, team))
                    subset_russian['list_direct_match_for_teams1'][i] = 1

                    break


In [65]:
success = subset_russian[subset_russian['list_direct_match_for_teams1'] == 1].shape[0]
print(success)
all_vals = subset_russian['list_direct_match_for_teams1'].shape[0]
print(all_vals)
baseline_team1 = success / all_vals
print(baseline_team1)

4413
10000
0.4413


In [None]:
for i, text in tqdm(enumerate(subset_russian['result.tsv'].values)):
    if subset_russian['list_direct_match_for_teams1'][i] == 1:
        continue
    else:
        #print(i)
        capital_words_ru = re.findall(r'([А-Я][а-я]+)', text)
        quotes = re.findall(r'\"(.+?)\"', text)
        quotes2 = re.findall(r'«(.+?)»', text)
        quotes3 = re.findall(r'\'(.+?)\'', text)
        search_words = capital_words_ru + quotes + quotes2 + quotes3
        search_words = set(search_words)
        good_words = [word for word in search_words if word.lower() not in newlist_set] 
#         print(i, good_words)
#         print(subset_russian['team_of_season_1_name_in_russian'][i])
        
    
        for word in good_words:
            for team in unique_teams:
                if fuzz.ratio(word, team) > 65 and fuzz.ratio(word, subset_russian['team_of_season_1_name_in_russian'][i]) > 65:
#                     print(subset_russian['team_of_season_1_name_in_russian'][i], word, team, fuzz.ratio(word, team))
                    subset_russian['list_direct_match_for_teams1'][i] = 1

                    break

In [152]:
success = subset_russian[subset_russian['list_direct_match_for_teams1'] == 1].shape[0]
print(success)
all_vals = subset_russian['list_direct_match_for_teams1'].shape[0]
print(all_vals)
baseline_team1 = success / all_vals
print(baseline_team1)

6457
10000
0.6457


### Команда 2 

In [None]:
for i, text in tqdm(enumerate(subset_russian['result.tsv'].values)):
    if subset_russian['list_direct_match_for_teams2'][i] == 1:
        continue
    else:
        #print(i)
        capital_words_ru = re.findall(r'([А-Я][а-я]+)', text)
        quotes = re.findall(r'\"(.+?)\"', text)
        quotes2 = re.findall(r'«(.+?)»', text)
        quotes3 = re.findall(r'\'(.+?)\'', text)
        search_words = capital_words_ru + quotes + quotes2 + quotes3
        search_words = set(search_words)
        good_words = [word for word in search_words if word.lower() not in newlist_set] 
        #print(good_words)
    
        for word in good_words:
            for team in unique_teams:
                if fuzz.ratio(word, team) > 85 and subset_russian['team_of_season_2_name_in_russian'][i] == word:
                    #print(russian['team_of_season_2_name_in_russian'][i], word, team, fuzz.ratio(word, team))
                    subset_russian['list_direct_match_for_teams2'][i] = 1

                    break


In [66]:
success = subset_russian[subset_russian['list_direct_match_for_teams2'] == 1].shape[0]
print(success)
all_vals = subset_russian['list_direct_match_for_teams2'].shape[0]
print(all_vals)
baseline_team2 = success / all_vals
print(baseline_team2)

4144
10000
0.4144


In [None]:
for i, text in tqdm(enumerate(subset_russian['result.tsv'].values)):
    if subset_russian['list_direct_match_for_teams2'][i] == 1:
        continue
    else:
        #print(i)
        capital_words_ru = re.findall(r'([А-Я][а-я]+)', text)
        quotes = re.findall(r'\"(.+?)\"', text)
        quotes2 = re.findall(r'«(.+?)»', text)
        quotes3 = re.findall(r'\'(.+?)\'', text)
        search_words = capital_words_ru + quotes + quotes2 + quotes3
        search_words = set(search_words)
        good_words = [word for word in search_words if word.lower() not in newlist_set] 
        #print(good_words)
    
        for word in good_words:
            for team in unique_teams:
                if fuzz.ratio(word, team) > 65 and fuzz.ratio(word, subset_russian['team_of_season_2_name_in_russian'][i]) > 65:                    #print(russian['team_of_season_2_name_in_russian'][i], word, team, fuzz.ratio(word, team))
                    subset_russian['list_direct_match_for_teams2'][i] = 1

                    break

In [154]:
success = subset_russian[subset_russian['list_direct_match_for_teams2'] == 1].shape[0]
print(success)
all_vals = subset_russian['list_direct_match_for_teams2'].shape[0]
print(all_vals)
baseline_team2 = success / all_vals
print(baseline_team2)

6209
10000
0.6209


In [155]:
subset_russian.to_csv(r'C:\Users\User\Desktop\project\fuzzylistsnew20106.csv', index = False)

### Лига

In [83]:
leagues_match = np.zeros(subset_russian.shape[0])

In [None]:
for i, text in tqdm(enumerate(subset_russian['result.tsv'].values)):
    #print(i)
    capital_words_ru = re.findall(r'([А-Я][а-я]+)', text)
    quotes = re.findall(r'\"(.+?)\"', text)
    quotes2 = re.findall(r'«(.+?)»', text)
    quotes3 = re.findall(r'\'(.+?)\'', text)
    search_words = capital_words_ru + quotes + quotes2 + quotes3
    search_words = set(search_words)
    good_words = [word for word in search_words if word.lower() not in newlist_set] 
    #print(good_words)
    
    for word in good_words:
        for league in unique_leagues:
            if fuzz.ratio(word, league) > 50 and fuzz.ratio(word, subset_russian['league_name_in_russian'][i]) > 50:
                print(subset_russian['league_name_in_russian'][i], word, league, fuzz.ratio(word, league))
                #print('yes')
                leagues_match[i] = 1

                break

In [85]:
subset_russian['list_fuzzy_match_for_league'] = leagues_match

In [86]:
success = subset_russian[subset_russian['list_fuzzy_match_for_league'] == 1].shape[0]
print(success)
all_vals = subset_russian['list_fuzzy_match_for_league'].shape[0]
print(all_vals)
baseline_league = success / all_vals
print(baseline_league)

1642
10000
0.1642


In [87]:
subset_russian.to_csv(r'C:\Users\User\Desktop\project\fuzzylists0306_withleagues.csv', index = False)