In [1]:
# 查看当前挂载的数据集目录, 该目录下的变更重启环境后会自动还原
# View dataset directory. This directory will be recovered automatically after resetting environment. 
!ls /home/aistudio/data

data19334


In [2]:
# 查看工作区文件, 该目录下的变更将会持久保存. 请及时清理不必要的文件, 避免加载过慢.
# View personal work directory. All changes under this directory will be kept even after reset. Please clean unnecessary files in time to speed up environment loading.
!ls /home/aistudio/work

请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here ](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions. 

实验要求：
1. 观看视频，了解本实验文本分类任务的实现流程；
2. 查资料，了解TF/IDF计算方法，并回答在这段文字下面：

## **0.导入相关的包**

In [3]:
import numpy as np
import os
import re
import sklearn
import matplotlib.pyplot as plt
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import sklearn.model_selection as sk_model_selection
#NLTK连不上
# import nltk
# from nltk.corpus import stopwords
# nltk.download('punkt')
# # nltk.download()#第一次运行可能要nltk.download()
#PRD：无法使用NLTK的功能

## **1.切分和清理文本**

In [4]:
def Text_process(text,stem=True):
    #人造特征(网址+电话)
    man_made_features = [0,0]
    #是否有网址
    ###请解释下面的正则表达式
    pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')#在注释中解释这一句正则表达式
    if re.findall(pattern,text):
        man_made_features[0]=1
    #是否有电话
    if re.findall(r'[0-9]{4,}',text):#在注释中解释这一句正则表达式
        man_made_features[1]=1
    #去除缩写"'m","'re","'s"替换部分缩写
    short = ["'m","'re","'s","'ve",' c u ',"n't",' u ']
    repla = ['','','','have',' see you ',' not',' you ']
    for i,w in enumerate(short):
        text=text.replace(w,repla[i])
    #去除标点
    remove = str.maketrans('','',string.punctuation) 
    text = text.translate(remove)    
    #分词
    # tokens = nltk.word_tokenize(text)
    tokens = text.split()
    #去停用词
    # doc = [w for w in tokens if not w in stopwords.words('english')] 
    doc = tokens
    #词干提取（默认开启）
    if stem:
        s = nltk.stem.SnowballStemmer('english')
        doc = [s.stem(ws) for ws in doc]
    result = ' '.join(doc)
    return man_made_features,result

In [5]:
def File_process(filename,stem=True):
    # 标签
    labels = []
    # 特征
    features=[]
    # 邮件内容
    contents = []
    # 读取文件
    with open(filename,'r',encoding='utf-8') as f:
        # 逐行读取
        lines = f.readlines()
        i = 0
        for line in lines:
            # 得到训练数据和标签
            target = line.split('\t')
            # 读取标签
            label = 1 if target[0]=='ham' else 0
            feature,content = Text_process(target[1].lower(),stem)
            ###
            labels.append(label)
            features.append(feature)
            contents.append(content)
            #if len(content):
                # labels.append(label)
                # features.append(feature)
                # contents.append(content)

    return contents,features,labels

## **2.利用TFIDF构造训练集**

In [6]:
# 转换为TFIDF矩阵
#在注释中解释每一句代码的含义
def TF_IDF(contents): #  定义TFIDF向量化器，设置参数
    vec = TFIDF( # 最小文档频率，过滤掉文档频率小于该值的单词
        min_df=3,  
        max_features=None, # 最大特征数，保留出现频率最高的前N个特征 
        strip_accents='unicode', # 去除文本中的重音符号
        analyzer='word', # 分析单词
        token_pattern=r'\w{1,}', # 正则表达式匹配单词
        ngram_range=(1, 1), # n-gram 特征的范围，这里只考虑单个单词 
        use_idf=1, # 是否使用idf权重
        smooth_idf=1, # 平滑idf权重  
        sublinear_tf=1, # 将tf取对数
        stop_words='english' # 停用词列表，去除常用词汇
    )
    X = vec.fit_transform(contents) # 将输入文本转换为TFIDF矩阵
    result = pd.DataFrame(X.toarray(),columns=vec.get_feature_names()) # 将矩阵转换为DataFrame
    print("shape of data:",result.shape) # 打印矩阵的形状 
    return result,vec # 返回结果和向量化器

In [8]:
contents,features,labels=File_process(os.path.join('data','data19334','SMSSpam.txt'),False)#不提取词干，可能要等一会

In [9]:
contents[:5] #前五条数据

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif you oni',
 'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetc apply 08452810075over18',
 'u dun say so early hor you c already then say',
 'nah i do not think he goes to usf he lives around here though']

In [10]:
print(len(contents)) #数据集长度

5574


In [11]:
X_1,vec = TF_IDF(contents) # X_1：无人造特征的训练集
F = pd.DataFrame(features,columns=['网址','电话'])   # F:人造特征
X_2 = pd.concat([F,X_1],axis=1) # X_2：加入人造特征
Y = pd.DataFrame(labels,columns=['labels'])#Y:标签

shape of data: (5574, 2581)


In [12]:
print(Y.head(5))
print("无人造特征：",X_1.shape)
print("有人造特征：",X_2.shape)

   labels
0       1
1       1
2       0
3       1
4       1
无人造特征： (5574, 2581)
有人造特征： (5574, 2583)


In [13]:
baseline = sum(labels)/len(labels) #0.866 非垃圾邮件占所有邮件的 86.6%，只有结果超过 86.6%，模型才有意义
print(baseline)

0.8659849300322928


## **3.应用各种模型，对比效果**

In [14]:
#最高分：不提取词干+人造特征(网址+电话)+补集朴素贝叶斯+sigmid校准 准确率： 0.9872643820427529 
#10折交叉验证
CV = 10 #交叉验证次数，嫌弃速度慢可以改小
names = ["LR"
        ,"LR + Isotonic"
        ,"LR + Sigmoid"
        ,"Naive Bayes"
        ]

# 各类模型
models = [
        # LR
        LogisticRegression()
        # LR + Isotonic
        ,CalibratedClassifierCV(LogisticRegression(), cv=2, method='isotonic')
        # LR + Sigmoid
        ,CalibratedClassifierCV(LogisticRegression(), cv=2, method='sigmoid')
        # Naive Bayes
        ,MultinomialNB()
         ]

### **对比有无人造特征的效果**

In [15]:
print("无人造特征")
for name,model in zip(names,models):
    accs=sum(sk_model_selection.cross_val_score(model, X_1, y=Y,cv=10, n_jobs=-1
                                                #,scoring='roc_auc'
                                                ))/10   
    print(name,'交叉验证结果:',accs)

无人造特征
LR 交叉验证结果: 0.9660910664530284
LR + Isotonic 交叉验证结果: 0.9752398602343583
LR + Sigmoid 交叉验证结果: 0.9750603270207139
Naive Bayes 交叉验证结果: 0.9786503478053834


In [19]:
print("有人造特征")
for name,model in zip(names,models):
    accs=sum(sk_model_selection.cross_val_score(model, X_2, y=Y,cv=CV, n_jobs=-1
                                                #,scoring='roc_auc'
                                                ))/10    
    print(name,'交叉验证结果:',accs)

有人造特征
LR 交叉验证结果: 0.9773932935657612
LR + Isotonic 交叉验证结果: 0.9838529500717488
LR + Sigmoid 交叉验证结果: 0.9779315714625844
Naive Bayes 交叉验证结果: 0.9874413621358661
