In [22]:
import pandas
import psycopg2
from sklearn.feature_extraction.text import TfidfVectorizer
from typing import List

from sklearn.metrics import roc_auc_score

from utils.nlp.word_cut import tokenizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from getpass import getpass

# Connection settings for the "resman" PostgreSQL database. The password is
# kept out of this mapping and prompted for interactively at connect time,
# so it is never stored in the notebook.
db_settings = {
    "database": "resman",
    "user": "resman",
    "host": "172.30.0.59",
    "port": "5432",
}
conn = psycopg2.connect(password=getpass("Input db password"), **db_settings)

In [23]:

def creating_vectorizor(
    titles: List[str], max_df: float = 0.95, min_df: int = 3
) -> CountVectorizer:
    """Fit a bag-of-words ``CountVectorizer`` on a corpus of titles.

    The project ``tokenizer`` is used as the analyzer, so each title is split
    into tokens by that callable rather than by scikit-learn's defaults.

    Args:
        titles: Raw title strings the vocabulary is learned from.
        max_df: Forwarded to ``CountVectorizer(max_df=...)``. With the default
            of 0.95, tokens found in more than 95% of the titles are dropped.
        min_df: Forwarded to ``CountVectorizer(min_df=...)``. With the default
            of 3, tokens found in fewer than 3 titles are dropped.

    Returns:
        The fitted ``CountVectorizer`` (the return annotation previously
        claimed ``TfidfVectorizer``, which this function never produced).
    """
    print(f"Training from {len(titles)} titles")
    vectorizer = CountVectorizer(
        analyzer=tokenizer,
        lowercase=False,
        max_df=max_df,
        min_df=min_df,
    )
    return vectorizer.fit(titles)

In [24]:
# Load every title from the image-list table, then fit the vocabulary on them.
all_titles = pandas.read_sql("select title from public.data_imagelist", conn)
vectorizor = creating_vectorizor(all_titles["title"].tolist())

Training from 35875 titles


In [25]:
# Fetch the titles user 1 has interacted with (an impression, a click, or a
# positive reaction). A row is labelled positive when it has more than two
# clicks, a positive reaction, or a fetch rate above 0.85.
df = pandas.read_sql("""
SELECT
    title,
    (click_count>2 OR positive_reaction is true OR fetch_rate>0.85) as y
FROM
    public.imagelist_master
WHERE
    user_id=1 AND (impression_count>0 OR click_count>0 OR positive_reaction is true)
""", conn)

# Encode the titles with the previously fitted vocabulary and hold out 15%
# of the rows for evaluation (fixed seed so the split is repeatable).
title_counts = vectorizor.transform(df["title"].tolist())
labels = df["y"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    title_counts, labels, test_size=0.15, random_state=0
)

# L1-penalised logistic regression with balanced class weights.
C = 4
lr_model = LogisticRegression(
    penalty='l1',
    C=C,
    solver='liblinear',
    class_weight="balanced",
    max_iter=1000,
)
lr_model.fit(X_train, y_train)

# Report ROC AUC on the held-out rows first, then on the training rows.
test_auc = roc_auc_score(y_test, lr_model.decision_function(X_test))
train_auc = roc_auc_score(y_train, lr_model.decision_function(X_train))
print("C={} Test score={}, train score={}".format(C, test_auc, train_auc))

C=4 Test score=0.7809431524547804, train score=0.9991039426523296


In [26]:
# Pair every vocabulary token with its logistic-regression coefficient and
# rank the pairs from the largest coefficient to the smallest.
#
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out()` and only fall back to the old accessor
# on installations that predate it.
feature_names = (
    vectorizor.get_feature_names_out()
    if hasattr(vectorizor, "get_feature_names_out")
    else vectorizor.get_feature_names()
)
features = sorted(
    zip(list(feature_names), lr_model.coef_[0].tolist()),
    key=lambda pair: pair[1],
    # Descending order; `reverse=True` keeps the sort stable, so tokens with
    # equal coefficients stay in vocabulary order exactly as before.
    reverse=True,
)

In [27]:
# Display the first 40 (token, coefficient) pairs, i.e. the tokens with the
# largest coefficients, since `features` is sorted in descending order.
features[:40]

[('多肉', 8.604532271775538),
 ('微胖', 6.2939372687111685),
 ('实现', 5.622967852460113),
 ('韵味', 5.591806119353699),
 ('欠干', 5.431505576988046),
 ('小萝莉', 5.252513094728178),
 ('93', 5.075816201911905),
 ('体会', 4.840009847888401),
 ('看不到', 4.792743497066377),
 ('难忍', 4.7610244920354345),
 ('·', 4.700818592630982),
 ('无毛', 4.633792680526883),
 ('天天', 4.61396263138857),
 ('最舒服', 4.612184033485558),
 ('黑人', 4.5165414460811935),
 ('含泪', 4.479068527959245),
 ('观音坐莲', 4.445521398339773),
 ('娘们', 4.439230096575159),
 ('丰满', 4.4171651870893704),
 ('几吧', 4.382422114821367),
 ('一辆', 4.371113815675124),
 ('馒头', 4.361352080464563),
 ('过瘾', 4.315477942146212),
 ('空虚', 4.279875334321422),
 ('输出', 4.2236684409297025),
 ('碰', 4.20241044446053),
 ('肉体', 4.168072604258457),
 ('一塌糊涂', 4.156155494959358),
 ('城中村', 4.075410598086435),
 ('国庆', 4.0680846489278135),
 ('胖子', 4.0314154867191565),
 ('纯', 3.930387949624208),
 ('大胸', 3.8688245133200097),
 ('硕', 3.8524871812305146),
 ('PT', 3.8390351673560117),
 ('姐妹', 

In [28]:
# Display the last 20 (token, coefficient) pairs, i.e. the tokens with the
# smallest (most negative) coefficients, since `features` is sorted descending.
features[-20:]

[('鲍鱼', -2.968748909165689),
 ('不想', -3.052615264773241),
 ('就算', -3.053664470130807),
 ('手指', -3.055493799789334),
 ('豪乳', -3.069950300846224),
 ('撩妹', -3.222072987181318),
 ('雪白', -3.254665977075379),
 ('可惜', -3.2952820541441707),
 ('穿着', -3.3631627189451705),
 ('配合', -3.410935180834021),
 ('拍摄', -3.427544876877342),
 ('引导', -3.731118857187758),
 ('美穴', -3.7781701036542987),
 ('丁字裤', -3.826494428445469),
 ('骚气', -4.097246298353958),
 ('穿', -4.449563403597916),
 ('强烈', -4.7784461371747335),
 ('淫荡', -4.8188427553940185),
 ('夫妻生活', -4.956902370454732),
 ('唯美', -6.692442092771125)]